Files
ai-chatexport/tests/test_providers.py
JesseMarkowitz 473d02f71a feat: v0.4.0 — rich content support with typed blocks and loss visibility
Extracts per-message content into a typed `blocks` list (text, code,
thinking, tool_use, tool_result, image_placeholder, file_placeholder,
unknown) and renders them at exporter write time. Voice transcripts,
Custom Instructions, and image references now appear in exports
instead of being silently dropped.

Foundation:
- src/blocks.py: pure block constructors, _safe_fence (fence-corruption
  defense, verified live in Joplin), _blockquote_prefix, render
- src/loss_report.py: per-run tally surfaced as INFO summary at end of
  export so silently-dropped data becomes visible

Providers:
- ChatGPT: dispatch on content_type produces typed blocks; voice shapes
  (audio_transcription, audio_asset_pointer,
  real_time_user_audio_video_asset_pointer) locked from live DevTools
  capture; Custom Instructions bug fix (parts-vs-direct-fields); role
  filter lifted; hidden-context marker driven by the
  is_visually_hidden_from_conversation flag
- Claude: defensive dispatch for text/thinking/tool_use/tool_result/image
  with recursive nested-block flattening; untested against real rich-
  content data — fix-forward in v0.4.1

Exporter:
- Markdown renders from blocks at write time via render_blocks_to_markdown;
  backward-compat fallback to content for any pre-v0.4.0 cached data

Tests:
- 27 new tests across providers, exporters, CLI; fixtures rebuilt with
  real-shape ChatGPT voice + Custom Instructions cases
- 181/181 pass

Behavior changes (intentional):
- JSON output omits content; consumers should read blocks
- Per-conversation message counts increase (Custom Instructions, image-
  only, tool-only messages now appear)
- Existing exports not auto-re-rendered; users wanting fresh output run
  cache --clear then export

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-04 23:17:18 -04:00

465 lines
17 KiB
Python

"""Unit tests for src/providers/ using fixture files."""
import json
import logging
from pathlib import Path
import pytest
from src.blocks import (
BLOCK_TYPE_FILE_PLACEHOLDER,
BLOCK_TYPE_HIDDEN_CONTEXT_MARKER,
BLOCK_TYPE_IMAGE_PLACEHOLDER,
BLOCK_TYPE_TEXT,
BLOCK_TYPE_THINKING,
BLOCK_TYPE_TOOL_RESULT,
BLOCK_TYPE_TOOL_USE,
BLOCK_TYPE_UNKNOWN,
)
from src.loss_report import LossReport
# Directory of JSON fixture files, located next to this test module.
FIXTURES = Path(__file__).parent.joinpath("fixtures")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _block_types(message: dict) -> list[str]:
return [b.get("type") for b in (message.get("blocks") or [])]
def _first_block(message: dict, block_type: str) -> dict | None:
for b in message.get("blocks") or []:
if b.get("type") == block_type:
return b
return None
# ---------------------------------------------------------------------------
# ChatGPT
# ---------------------------------------------------------------------------
class TestChatGPTNormalization:
    """ChatGPT normalize_conversation block-extraction behavior."""

    # ------------------------------------------------------------------
    # Helpers (not collected by pytest: names do not start with "test")
    # ------------------------------------------------------------------

    def _get_provider(self):
        """Build a ChatGPTProvider without running ``__init__``.

        The instance is created with ``__new__`` and only the private
        attributes assigned below are populated.
        """
        import requests

        from src.providers.chatgpt import ChatGPTProvider

        p = ChatGPTProvider.__new__(ChatGPTProvider)
        p._session = requests.Session()
        p._org_id = None
        p._project_ids = []
        p._project_map = {}
        p._project_name_cache = {}
        return p

    @staticmethod
    def _load_fixture(name: str = "chatgpt_conversation.json") -> dict:
        """Parse the JSON fixture *name*.

        The file is decoded explicitly as UTF-8 so the tests do not depend
        on the platform's default locale encoding.
        """
        return json.loads((FIXTURES / name).read_text(encoding="utf-8"))

    def _normalize(
        self,
        name: str = "chatgpt_conversation.json",
        report: LossReport | None = None,
    ) -> dict:
        """Normalize fixture *name* with a fresh provider.

        *report* is forwarded as the second positional argument only when
        given, so the no-report call path is exercised unchanged.
        """
        provider = self._get_provider()
        raw = self._load_fixture(name)
        if report is None:
            return provider.normalize_conversation(raw)
        return provider.normalize_conversation(raw, report)

    @staticmethod
    def _messages_with_text(
        result: dict, needle: str, *, ignore_case: bool = True
    ) -> list[dict]:
        """Return messages having at least one block whose ``text`` contains *needle*.

        With ``ignore_case`` the block text is lower-cased before the check,
        so *needle* must itself be lower-case.
        """

        def _hit(block: dict) -> bool:
            text = block.get("text") or ""
            return needle in (text.lower() if ignore_case else text)

        return [
            m for m in result["messages"]
            if any(_hit(b) for b in (m.get("blocks") or []))
        ]

    # ------------------------------------------------------------------
    # Conversation-level fields
    # ------------------------------------------------------------------

    def test_normalizes_conversation(self):
        result = self._normalize()
        assert result["id"] == "chatgpt-conv-001"
        assert result["title"] == "Python Async Tutorial"
        assert result["provider"] == "chatgpt"
        assert result["project"] is None
        assert result["created_at"] != ""
        assert result["updated_at"] != ""
        assert isinstance(result["messages"], list)

    def test_normalizes_without_project(self):
        result = self._normalize("chatgpt_no_project.json")
        assert result["project"] is None
        assert result["id"] == "chatgpt-conv-002"

    def test_normalizes_with_project_from_map(self):
        p = self._get_provider()
        # The map must be seeded before normalization.
        p._project_map["chatgpt-conv-001"] = "My Research Project"
        result = p.normalize_conversation(self._load_fixture())
        assert result["project"] == "My Research Project"

    # ------------------------------------------------------------------
    # Block extraction
    # ------------------------------------------------------------------

    def test_text_message_emits_text_block(self):
        result = self._normalize()
        # The "How does async/await..." message
        async_msgs = [
            m for m in self._messages_with_text(result, "async")
            if m["role"] == "user"
        ]
        assert async_msgs, "expected a user message about async/await"
        assert _block_types(async_msgs[0]) == [BLOCK_TYPE_TEXT]

    def test_code_block_preserved_with_language(self):
        result = self._normalize()
        assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"]
        # The first assistant message is the async/await answer with a python fence
        text_block = _first_block(assistant_msgs[0], BLOCK_TYPE_TEXT)
        assert text_block is not None
        assert "```python" in text_block["text"]

    def test_multimodal_voice_user_message(self):
        result = self._normalize()
        # node-mm-user: audio_transcription "What is the capital of France?"
        # + real_time_user_audio_video_asset_pointer wrapping a sediment:// URL
        capital_msgs = self._messages_with_text(result, "capital of france")
        assert capital_msgs, "expected the audio_transcription text to surface"
        types = _block_types(capital_msgs[0])
        assert BLOCK_TYPE_TEXT in types
        assert BLOCK_TYPE_FILE_PLACEHOLDER in types
        file_block = _first_block(capital_msgs[0], BLOCK_TYPE_FILE_PLACEHOLDER)
        # Guard so a missing block fails as an assertion, not a TypeError.
        assert file_block is not None
        assert file_block["ref"].startswith("sediment://")
        assert file_block["mime"] == "audio/wav"
        assert file_block["size_bytes"] == 50000
        assert file_block["duration_seconds"] == pytest.approx(2.5)

    def test_multimodal_voice_reverse_order_preserved(self):
        result = self._normalize()
        # node-mm-user-rev has parts in REVERSE order: asset first, transcription second.
        rev_msgs = self._messages_with_text(result, "tell me more")
        assert rev_msgs, "expected the reverse-order voice message"
        types = _block_types(rev_msgs[0])
        # Order preserved: file_placeholder before text
        assert types == [BLOCK_TYPE_FILE_PLACEHOLDER, BLOCK_TYPE_TEXT]

    def test_image_only_user_message_renders(self):
        result = self._normalize()
        image_msgs = [
            m for m in result["messages"]
            if any(
                b.get("type") == BLOCK_TYPE_IMAGE_PLACEHOLDER
                for b in (m.get("blocks") or [])
            )
        ]
        assert image_msgs, "image-only user message should now render"

    def test_user_editable_context_emits_blocks(self):
        result = self._normalize()
        # The user_editable_context message has user_profile + user_instructions.
        # It should now appear (was silently dropped pre-v0.4.0).
        uec_msgs = self._messages_with_text(
            result, "Custom Instructions", ignore_case=False
        )
        assert uec_msgs, "user_editable_context should be visible in output"
        # Hidden context marker should be prepended.
        assert uec_msgs[0]["blocks"][0]["type"] == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER

    def test_user_editable_context_uses_safe_fence(self):
        """The user_instructions value contains embedded triple-backticks; the rendered
        Markdown must use a fence longer than 3 backticks so embedded fences are inert.
        """
        from src.blocks import render_blocks_to_markdown

        result = self._normalize()
        uec_msgs = self._messages_with_text(
            result, "Custom Instructions", ignore_case=False
        )
        assert uec_msgs
        rendered = render_blocks_to_markdown(uec_msgs[0]["blocks"])
        # Content has ``` inside, so the wrap fence must be at least 4 backticks.
        assert "````" in rendered, "expected a 4+ backtick safe-fence wrap"

    # ------------------------------------------------------------------
    # Message-level invariants and loss reporting
    # ------------------------------------------------------------------

    def test_message_roles_are_valid(self):
        result = self._normalize()
        for msg in result["messages"]:
            assert msg["role"] in ("user", "assistant", "system", "tool")

    def test_message_count_matches(self):
        result = self._normalize()
        assert result["message_count"] == len(result["messages"])

    def test_loss_report_records_messages(self):
        report = LossReport()
        result = self._normalize(report=report)
        assert report.messages_rendered == len(result["messages"])
        assert report.conversations == 1
class TestChatGPTUnknownContent:
    """Unrecognised content types should produce visible unknown blocks + WARNING + tally."""

    def _get_provider(self):
        """Build a ChatGPTProvider without running ``__init__``.

        The instance is created with ``__new__`` and only the private
        attributes assigned below are populated.
        """
        import requests

        from src.providers.chatgpt import ChatGPTProvider

        p = ChatGPTProvider.__new__(ChatGPTProvider)
        p._session = requests.Session()
        p._org_id = None
        p._project_ids = []
        p._project_map = {}
        p._project_name_cache = {}
        return p

    def _make_unknown_conv(self):
        """Return a minimal raw conversation: a root node plus one user
        message whose ``content_type`` is a made-up, unrecognised value.
        """
        return {
            "id": "test-unknown",
            "title": "Test",
            "create_time": 1700000000.0,
            "update_time": 1700000001.0,
            "mapping": {
                "root": {"id": "root", "message": None, "parent": None, "children": ["msg1"]},
                "msg1": {
                    "id": "msg1",
                    "message": {
                        "id": "msg1",
                        "author": {"role": "user"},
                        "content": {
                            "content_type": "future_unknown_type_xyz",
                            "some_field": "value",
                        },
                    },
                    "parent": "root",
                    "children": [],
                },
            },
        }

    def test_unknown_content_type_produces_unknown_block(self):
        p = self._get_provider()
        result = p.normalize_conversation(self._make_unknown_conv())
        assert any(
            b.get("type") == BLOCK_TYPE_UNKNOWN
            for m in result["messages"]
            for b in (m.get("blocks") or [])
        )

    def test_unknown_content_type_logs_warning(self, caplog):
        p = self._get_provider()
        with caplog.at_level(logging.WARNING):
            p.normalize_conversation(self._make_unknown_conv())
        # getMessage() is always available on a LogRecord, whereas the
        # ``message`` attribute only exists once a formatter has run. The
        # level is checked explicitly so a lower-severity record that merely
        # mentions the type cannot satisfy the test.
        assert any(
            r.levelno >= logging.WARNING
            and "future_unknown_type_xyz" in r.getMessage()
            for r in caplog.records
        ), "expected a WARNING (or higher) naming the unknown content type"

    def test_unknown_content_type_increments_loss_report(self):
        p = self._get_provider()
        report = LossReport()
        p.normalize_conversation(self._make_unknown_conv(), report)
        assert report.unknown_blocks["future_unknown_type_xyz"] == 1
# ---------------------------------------------------------------------------
# Claude
# ---------------------------------------------------------------------------
class TestClaudeNormalization:
    """Claude normalize_conversation block-extraction behavior."""

    # ------------------------------------------------------------------
    # Helpers (not collected by pytest: names do not start with "test")
    # ------------------------------------------------------------------

    def _get_provider(self):
        """Build a ClaudeProvider without running ``__init__``.

        The instance is created with ``__new__`` and only the private
        attributes assigned below are populated.
        """
        import requests

        from src.providers.claude import ClaudeProvider

        p = ClaudeProvider.__new__(ClaudeProvider)
        p._session = requests.Session()
        p._org_id = None
        return p

    @staticmethod
    def _load_fixture(name: str = "claude_conversation.json") -> dict:
        """Parse the JSON fixture *name*.

        The file is decoded explicitly as UTF-8 so the tests do not depend
        on the platform's default locale encoding.
        """
        return json.loads((FIXTURES / name).read_text(encoding="utf-8"))

    def _normalize(
        self,
        name: str = "claude_conversation.json",
        report: LossReport | None = None,
    ) -> dict:
        """Normalize fixture *name* with a fresh provider.

        *report* is forwarded as the second positional argument only when
        given, so the no-report call path is exercised unchanged.
        """
        provider = self._get_provider()
        raw = self._load_fixture(name)
        if report is None:
            return provider.normalize_conversation(raw)
        return provider.normalize_conversation(raw, report)

    # ------------------------------------------------------------------
    # Fixture-driven tests
    # ------------------------------------------------------------------

    def test_normalizes_with_project(self):
        result = self._normalize()
        assert result["id"] == "claude-conv-001"
        assert result["title"] == "StartOS Service Packaging"
        assert result["provider"] == "claude"
        assert result["project"] == "StarTOS Packaging"
        assert result["created_at"] == "2024-06-10T14:32:00.000Z"

    def test_normalizes_without_project(self):
        result = self._normalize("claude_no_project.json")
        assert result["project"] is None

    def test_string_content_emits_text_block(self):
        result = self._normalize()
        thanks_msgs = [
            m for m in result["messages"]
            if any(
                "thank you" in (b.get("text") or "").lower()
                for b in (m.get("blocks") or [])
            )
        ]
        assert thanks_msgs

    def test_list_content_emits_blocks_in_order(self):
        result = self._normalize()
        assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"]
        # msg-002 has text + tool_use, in that order.
        assert assistant_msgs
        types = _block_types(assistant_msgs[0])
        assert BLOCK_TYPE_TEXT in types
        assert BLOCK_TYPE_TOOL_USE in types
        # Order preserved
        assert types.index(BLOCK_TYPE_TEXT) < types.index(BLOCK_TYPE_TOOL_USE)

    def test_tool_use_block_fields(self):
        result = self._normalize()
        assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"]
        tool_block = _first_block(assistant_msgs[0], BLOCK_TYPE_TOOL_USE)
        # Guard so a missing block fails as an assertion, not a TypeError.
        assert tool_block is not None
        assert tool_block["name"] == "search"
        assert tool_block["input"] == {"query": "startOS docs"}
        assert tool_block["tool_id"] == "tool-001"

    def test_image_block_emits_image_placeholder(self):
        result = self._normalize()
        msg004 = [
            m for m in result["messages"]
            if any(
                b.get("type") == BLOCK_TYPE_IMAGE_PLACEHOLDER
                for b in (m.get("blocks") or [])
            )
        ]
        assert msg004
        img = _first_block(msg004[0], BLOCK_TYPE_IMAGE_PLACEHOLDER)
        assert img is not None
        assert img["ref"] == "claude-image-uuid-1"

    def test_human_sender_maps_to_user(self):
        result = self._normalize()
        roles = {m["role"] for m in result["messages"]}
        assert "user" in roles
        assert "human" not in roles

    def test_loss_report_messages_recorded(self):
        report = LossReport()
        result = self._normalize(report=report)
        assert report.messages_rendered == len(result["messages"])

    # ------------------------------------------------------------------
    # Inline-payload tests (no fixture file involved)
    # ------------------------------------------------------------------

    def test_unknown_block_type_records_loss(self):
        raw = {
            "uuid": "test-unknown",
            "name": "T",
            "chat_messages": [
                {
                    "uuid": "m1",
                    "sender": "human",
                    "content": [{"type": "future_block_xyz", "data": "..."}],
                }
            ],
        }
        p = self._get_provider()
        report = LossReport()
        result = p.normalize_conversation(raw, report)
        # Uses the module-level BLOCK_TYPE_UNKNOWN import; no local re-import needed.
        assert any(
            b.get("type") == BLOCK_TYPE_UNKNOWN
            for m in result["messages"]
            for b in (m.get("blocks") or [])
        )
        assert report.unknown_blocks["future_block_xyz"] == 1

    def test_thinking_block(self):
        raw = {
            "uuid": "thinking-test",
            "name": "T",
            "chat_messages": [
                {
                    "uuid": "m1",
                    "sender": "assistant",
                    "content": [
                        {"type": "thinking", "thinking": "Let me reason about this."},
                        {"type": "text", "text": "Here's the answer."},
                    ],
                }
            ],
        }
        p = self._get_provider()
        result = p.normalize_conversation(raw)
        types = _block_types(result["messages"][0])
        assert BLOCK_TYPE_THINKING in types
        assert BLOCK_TYPE_TEXT in types

    def test_tool_result_with_nested_text_blocks(self):
        raw = {
            "uuid": "tool-result-test",
            "name": "T",
            "chat_messages": [
                {
                    "uuid": "m1",
                    "sender": "assistant",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": "tool-001",
                            "content": [
                                {"type": "text", "text": "search hit 1"},
                                {"type": "text", "text": "search hit 2"},
                            ],
                            "is_error": False,
                        }
                    ],
                }
            ],
        }
        p = self._get_provider()
        result = p.normalize_conversation(raw)
        tool_result = _first_block(result["messages"][0], BLOCK_TYPE_TOOL_RESULT)
        assert tool_result is not None
        assert "search hit 1" in tool_result["output"]
        assert "search hit 2" in tool_result["output"]
        assert tool_result["is_error"] is False