feat: v0.4.0 — rich content support with typed blocks and loss visibility

Extracts per-message content into a typed `blocks` list (text, code, thinking, tool_use, tool_result, image_placeholder, file_placeholder, unknown) and renders them at exporter write time. Voice transcripts, Custom Instructions, and image references now appear in exports instead of being silently dropped. Foundation: - src/blocks.py: pure block constructors, _safe_fence (fence-corruption defense, verified live in Joplin), _blockquote_prefix, render - src/loss_report.py: per-run tally surfaced as INFO summary at end of export so silently-dropped data becomes visible Providers: - ChatGPT: dispatch on content_type produces typed blocks; voice shapes (audio_transcription, audio_asset_pointer, real_time_user_audio_video_asset_pointer) locked from live DevTools capture; Custom Instructions bug fix (parts-vs-direct-fields); role filter lifted; hidden-context marker driven by is_visually_hidden_from_conversation flag - Claude: defensive dispatch for text/thinking/tool_use/tool_result/image with recursive nested-block flattening; untested against real rich-content data — fix-forward in v0.4.1 Exporter: - Markdown renders from blocks at write time via render_blocks_to_markdown; backward-compat fallback to content for any pre-v0.4.0 cached data Tests: - 27 new tests across providers, exporters, CLI; fixtures rebuilt with real-shape ChatGPT voice + Custom Instructions cases - 181/181 pass Behavior changes (intentional): - JSON output omits content; consumers should read blocks - Per-conversation message counts increase (Custom Instructions, image-only, tool-only messages now appear) - Existing exports not auto-re-rendered; users wanting fresh output run cache --clear then export Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-04 23:17:18 -04:00
parent 4798edcea7
commit 473d02f71a
16 changed files with 1786 additions and 232 deletions
--- a/tests/fixtures/chatgpt_conversation.json
+++ b/tests/fixtures/chatgpt_conversation.json
@@ -8,12 +8,30 @@
    "node-root": {
      "id": "node-root",
      "parent": null,
-      "children": ["node-1"],
+      "children": ["node-uec"],
      "message": null
    },
+    "node-uec": {
+      "id": "node-uec",
+      "parent": "node-root",
+      "children": ["node-1"],
+      "message": {
+        "id": "node-uec",
+        "author": {"role": "user"},
+        "create_time": null,
+        "content": {
+          "content_type": "user_editable_context",
+          "user_profile": "Preferred name: Jesse",
+          "user_instructions": "The user provided the additional info about how they would like you to respond:\n```Always cite sources.```"
+        },
+        "metadata": {
+          "is_visually_hidden_from_conversation": true
+        }
+      }
+    },
    "node-1": {
      "id": "node-1",
-      "parent": "node-root",
+      "parent": "node-uec",
      "children": ["node-2"],
      "message": {
        "id": "node-1",
@@ -28,7 +46,7 @@
    "node-2": {
      "id": "node-2",
      "parent": "node-1",
-      "children": ["node-3"],
+      "children": ["node-mm-user"],
      "message": {
        "id": "node-2",
        "author": {"role": "assistant"},
@@ -39,17 +57,71 @@
        }
      }
    },
-    "node-3": {
-      "id": "node-3",
+    "node-mm-user": {
+      "id": "node-mm-user",
      "parent": "node-2",
-      "children": [],
+      "children": ["node-mm-assistant"],
      "message": {
-        "id": "node-3",
+        "id": "node-mm-user",
        "author": {"role": "user"},
        "create_time": 1704067300.0,
        "content": {
-          "content_type": "image_asset_pointer",
-          "parts": [{"content_type": "image_asset_pointer", "asset_pointer": "file://some-image"}]
+          "content_type": "multimodal_text",
+          "parts": [
+            {"content_type": "audio_transcription", "text": "What is the capital of France?", "direction": "in", "decoding_id": null},
+            {"content_type": "real_time_user_audio_video_asset_pointer", "frames_asset_pointers": [], "video_container_asset_pointer": null, "audio_asset_pointer": {"content_type": "audio_asset_pointer", "asset_pointer": "sediment://file_user001", "size_bytes": 50000, "format": "wav", "metadata": {"start": 0.0, "end": 2.5}}, "audio_start_timestamp": 1.0}
+          ]
+        },
+        "metadata": {"voice_mode_message": true}
+      }
+    },
+    "node-mm-assistant": {
+      "id": "node-mm-assistant",
+      "parent": "node-mm-user",
+      "children": ["node-mm-user-rev"],
+      "message": {
+        "id": "node-mm-assistant",
+        "author": {"role": "assistant"},
+        "create_time": 1704067305.0,
+        "content": {
+          "content_type": "multimodal_text",
+          "parts": [
+            {"content_type": "audio_transcription", "text": "The capital of France is Paris.", "direction": "out", "decoding_id": null},
+            {"content_type": "audio_asset_pointer", "asset_pointer": "sediment://file_assistant001", "size_bytes": 80000, "format": "wav", "metadata": {"start": 0.0, "end": 3.2}}
+          ]
+        }
+      }
+    },
+    "node-mm-user-rev": {
+      "id": "node-mm-user-rev",
+      "parent": "node-mm-assistant",
+      "children": ["node-image-only"],
+      "message": {
+        "id": "node-mm-user-rev",
+        "author": {"role": "user"},
+        "create_time": 1704067400.0,
+        "content": {
+          "content_type": "multimodal_text",
+          "parts": [
+            {"content_type": "real_time_user_audio_video_asset_pointer", "frames_asset_pointers": [], "video_container_asset_pointer": null, "audio_asset_pointer": {"content_type": "audio_asset_pointer", "asset_pointer": "sediment://file_user002", "size_bytes": 30000, "format": "wav", "metadata": {"start": 0.0, "end": 1.5}}, "audio_start_timestamp": 5.0},
+            {"content_type": "audio_transcription", "text": "Tell me more please.", "direction": "in", "decoding_id": null}
+          ]
+        }
+      }
+    },
+    "node-image-only": {
+      "id": "node-image-only",
+      "parent": "node-mm-user-rev",
+      "children": [],
+      "message": {
+        "id": "node-image-only",
+        "author": {"role": "user"},
+        "create_time": 1704067500.0,
+        "content": {
+          "content_type": "multimodal_text",
+          "parts": [
+            {"content_type": "image_asset_pointer", "asset_pointer": "file-service://image001"}
+          ]
        }
      }
    }
--- a/tests/fixtures/claude_conversation.json
+++ b/tests/fixtures/claude_conversation.json
@@ -30,6 +30,15 @@
      "sender": "human",
      "created_at": "2024-06-10T14:45:00.000Z",
      "content": "Thank you, that helped!"
+    },
+    {
+      "uuid": "msg-004",
+      "sender": "human",
+      "created_at": "2024-06-10T14:50:00.000Z",
+      "content": [
+        {"type": "text", "text": "What about this image?"},
+        {"type": "image", "source": {"file_uuid": "claude-image-uuid-1", "media_type": "image/png"}}
+      ]
    }
  ]
 }
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -127,3 +127,50 @@ class TestExportSinceValidation:
            },
        )
        assert "Invalid --since date" not in result.output
+
+
+# ---------------------------------------------------------------------------
+# LossReport summary
+# ---------------------------------------------------------------------------
+
+
+class TestLossReportSummary:
+    """The LossReport's format_summary() pinned format covers zero, top-5, and overflow cases."""
+
+    def test_zero_summary_uses_none_sentinel(self):
+        from src.loss_report import LossReport
+
+        report = LossReport()
+        out = report.format_summary()
+        assert "[export] Run summary:" in out
+        assert "conversations:        0" in out
+        assert "messages rendered:    0" in out
+        # Both "(none)" sentinels present — never empty parens
+        assert out.count("(none)") == 2
+
+    def test_top_5_breakdown(self):
+        from src.loss_report import LossReport
+
+        report = LossReport()
+        for raw_type in ("a", "b", "c", "d", "e", "f", "g"):
+            report.record_unknown(raw_type)
+            if raw_type == "a":
+                # Make 'a' the most common
+                for _ in range(4):
+                    report.record_unknown("a")
+        out = report.format_summary()
+        # Top entry shown
+        assert "a=5" in out
+        # Overflow line present (7 types, top 5 + 2 more)
+        assert "+ 2 more types" in out
+
+    def test_messages_and_conversations_recorded(self):
+        from src.loss_report import LossReport
+
+        report = LossReport()
+        report.record_conversation()
+        report.record_message()
+        report.record_message()
+        out = report.format_summary()
+        assert "conversations:        1" in out
+        assert "messages rendered:    2" in out
--- a/tests/test_exporters.py
+++ b/tests/test_exporters.py
@@ -1,4 +1,4 @@
-"""Unit tests for src/exporters/."""
+"""Unit tests for src/exporters/ and src/blocks.py."""

 import json
 import os
@@ -7,6 +7,23 @@ from pathlib import Path

 import pytest

+from src.blocks import (
+    BLOCK_TYPE_TEXT,
+    UNKNOWN_REASON_EXTRACTION_FAILED,
+    UNKNOWN_REASON_UNKNOWN_TYPE,
+    _blockquote_prefix,
+    _safe_fence,
+    make_code_block,
+    make_file_placeholder,
+    make_hidden_context_marker,
+    make_image_placeholder,
+    make_text_block,
+    make_thinking_block,
+    make_tool_result_block,
+    make_tool_use_block,
+    make_unknown_block,
+    render_blocks_to_markdown,
+)
 from src.exporters.markdown import MarkdownExporter, _yaml_escape, _format_timestamp
 from src.exporters.json_export import JSONExporter

@@ -250,3 +267,240 @@ class TestFormatTimestamp:

    def test_empty_string(self):
        assert _format_timestamp("") == ""
+
+
+# ---------------------------------------------------------------------------
+# Block helpers and rendering
+# ---------------------------------------------------------------------------
+
+
+class TestSafeFence:
+    def test_minimum_three_backticks(self):
+        assert _safe_fence("plain text") == "```"
+
+    def test_four_backticks_when_three_in_content(self):
+        assert _safe_fence("here ``` is a fence") == "````"
+
+    def test_five_backticks_when_four_in_content(self):
+        assert _safe_fence("here ```` is four") == "`````"
+
+    def test_handles_empty_string(self):
+        assert _safe_fence("") == "```"
+
+    def test_handles_run_at_end(self):
+        # Trailing run still counted
+        assert _safe_fence("text ending in ```") == "````"
+
+
+class TestBlockquotePrefix:
+    def test_single_line(self):
+        assert _blockquote_prefix("hello") == "> hello"
+
+    def test_multi_line(self):
+        assert _blockquote_prefix("a\nb\nc") == "> a\n> b\n> c"
+
+    def test_empty_lines_become_naked_quote_marker(self):
+        assert _blockquote_prefix("a\n\nb") == "> a\n>\n> b"
+
+    def test_empty_string(self):
+        assert _blockquote_prefix("") == ">"
+
+
+class TestBlockConstructors:
+    def test_make_text_block_returns_none_for_empty(self):
+        assert make_text_block("") is None
+        assert make_text_block("   ") is None
+
+    def test_make_text_block_returns_dict(self):
+        b = make_text_block("hello")
+        assert b == {"type": "text", "text": "hello"}
+
+    def test_make_code_block_returns_none_for_empty(self):
+        assert make_code_block("") is None
+
+    def test_make_thinking_block_returns_none_for_empty(self):
+        assert make_thinking_block("") is None
+
+
+class TestRenderBlocks:
+    def test_text_block_renders_as_paragraph(self):
+        out = render_blocks_to_markdown([make_text_block("Hello world")])
+        assert out == "Hello world"
+
+    def test_blocks_separated_by_blank_line(self):
+        out = render_blocks_to_markdown(
+            [make_text_block("first"), make_text_block("second")]
+        )
+        assert out == "first\n\nsecond"
+
+    def test_code_block_with_language(self):
+        out = render_blocks_to_markdown([make_code_block("print(1)", language="python")])
+        assert "```python" in out
+        assert "print(1)" in out
+
+    def test_thinking_block_uses_blockquote(self):
+        out = render_blocks_to_markdown([make_thinking_block("step 1\nstep 2")])
+        assert "**💭 Reasoning**" in out
+        assert "> step 1" in out
+        assert "> step 2" in out
+
+    def test_tool_use_renders_as_blockquote_with_safe_fence(self):
+        out = render_blocks_to_markdown(
+            [make_tool_use_block("search", {"query": "test"})]
+        )
+        assert "> 🔧 **Tool: search**" in out
+        # Every line of the body is blockquote-prefixed
+        assert "> ```json" in out
+        assert "> }" in out
+
+    def test_tool_use_with_multiline_input(self):
+        out = render_blocks_to_markdown(
+            [make_tool_use_block("complex", {"a": 1, "b": [{"x": "y"}]})]
+        )
+        # Prefix every line of multi-line JSON
+        for line in out.split("\n"):
+            assert line.startswith(">") or line == ""
+
+    def test_tool_result_success_uses_outbox_icon(self):
+        out = render_blocks_to_markdown([make_tool_result_block("OK")])
+        assert "📤 **Result**" in out
+        assert "❌" not in out
+
+    def test_tool_result_error_uses_x_icon(self):
+        out = render_blocks_to_markdown([make_tool_result_block("oops", is_error=True)])
+        assert "❌ **Result (error)**" in out
+        assert "📤" not in out
+
+    def test_image_placeholder_rendering(self):
+        out = render_blocks_to_markdown(
+            [make_image_placeholder(ref="file-123", source="user_upload")]
+        )
+        assert "🖼️ **Image attached**" in out
+        assert "`file-123`" in out
+        assert "user_upload" in out
+        assert "content not preserved" in out
+
+    def test_file_placeholder_with_metadata(self):
+        out = render_blocks_to_markdown(
+            [make_file_placeholder(ref="sediment://x", mime="audio/wav", size_bytes=10240, duration_seconds=2.5)]
+        )
+        assert "📎 **File attached**" in out
+        assert "audio/wav" in out
+        assert "KB" in out
+        assert "2.50s" in out
+
+    def test_unknown_block_renders_with_keys(self):
+        out = render_blocks_to_markdown(
+            [
+                make_unknown_block(
+                    raw_type="future_x",
+                    observed_keys=["foo", "bar"],
+                    reason=UNKNOWN_REASON_UNKNOWN_TYPE,
+                )
+            ]
+        )
+        assert "⚠️ **Unsupported content**" in out
+        assert "future_x" in out
+        assert "`foo`" in out
+        assert "`bar`" in out
+
+    def test_unknown_extraction_failed_includes_summary(self):
+        out = render_blocks_to_markdown(
+            [
+                make_unknown_block(
+                    raw_type="audio_transcription",
+                    observed_keys=["asset_pointer"],
+                    reason=UNKNOWN_REASON_EXTRACTION_FAILED,
+                    summary="expected key 'text' not found",
+                )
+            ]
+        )
+        assert "extraction_failed" in out
+        assert "expected key 'text' not found" in out
+
+    def test_hidden_context_marker(self):
+        out = render_blocks_to_markdown(
+            [make_hidden_context_marker("user_editable_context")]
+        )
+        assert "ℹ️ **Hidden context**" in out
+        assert "`user_editable_context`" in out
+
+    def test_safe_fence_prevents_runaway_code_block(self):
+        # Content contains an unbalanced opening fence — without _safe_fence
+        # this would corrupt downstream rendering.
+        evil_content = "before\n```Follow\ntext\nraw is: \"```"
+        block = make_code_block(evil_content)
+        out = render_blocks_to_markdown([block, make_text_block("after")])
+        # The 4-backtick wrap should be present
+        assert "````" in out
+        # The "after" text should appear OUTSIDE any code block — it follows
+        # the closing ```` fence.
+        assert out.endswith("after")
+
+    def test_block_order_preserved(self):
+        blocks = [
+            make_text_block("a"),
+            make_image_placeholder(ref="r1", source="user_upload"),
+            make_text_block("b"),
+        ]
+        out = render_blocks_to_markdown(blocks)
+        assert out.index("a") < out.index("Image attached")
+        assert out.index("Image attached") < out.index("b")
+
+
+# ---------------------------------------------------------------------------
+# Markdown exporter with blocks
+# ---------------------------------------------------------------------------
+
+
+SAMPLE_CONV_BLOCKS = {
+    "id": "blocks12345",
+    "title": "Blocks Conversation",
+    "provider": "claude",
+    "project": None,
+    "created_at": "2024-06-10T14:32:00Z",
+    "updated_at": "2024-06-10T15:00:00Z",
+    "message_count": 1,
+    "messages": [
+        {
+            "role": "assistant",
+            "content_type": "text",
+            "timestamp": None,
+            "blocks": [
+                {"type": "text", "text": "Here is the answer."},
+                {"type": "tool_use", "name": "search", "input": {"q": "x"}, "tool_id": "t1"},
+            ],
+        }
+    ],
+}
+
+
+class TestMarkdownExporterWithBlocks:
+    def test_renders_blocks(self, tmp_path):
+        exp = MarkdownExporter(tmp_path)
+        path = exp.export(SAMPLE_CONV_BLOCKS)
+        body = path.read_text()
+        assert "Here is the answer." in body
+        assert "🔧 **Tool: search**" in body
+
+    def test_falls_back_to_content_when_blocks_missing(self, tmp_path):
+        # Backward-compat: messages with `content` only (no `blocks`) still render.
+        exp = MarkdownExporter(tmp_path)
+        path = exp.export(SAMPLE_CONV)  # SAMPLE_CONV has content only, no blocks
+        body = path.read_text()
+        assert "Hello, how are you?" in body
+
+    def test_skips_messages_with_neither_blocks_nor_content(self, tmp_path):
+        conv = {
+            **SAMPLE_CONV_BLOCKS,
+            "messages": [
+                {"role": "user", "content_type": "text", "timestamp": None, "blocks": []},
+                {"role": "assistant", "content_type": "text", "timestamp": None, "blocks": [
+                    {"type": "text", "text": "I am here."}
+                ]},
+            ],
+        }
+        exp = MarkdownExporter(tmp_path)
+        path = exp.export(conv)
+        body = path.read_text()
+        assert "I am here." in body
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -1,19 +1,52 @@
 """Unit tests for src/providers/ using fixture files."""

 import json
+import logging
 from pathlib import Path

 import pytest

+from src.blocks import (
+    BLOCK_TYPE_FILE_PLACEHOLDER,
+    BLOCK_TYPE_HIDDEN_CONTEXT_MARKER,
+    BLOCK_TYPE_IMAGE_PLACEHOLDER,
+    BLOCK_TYPE_TEXT,
+    BLOCK_TYPE_THINKING,
+    BLOCK_TYPE_TOOL_RESULT,
+    BLOCK_TYPE_TOOL_USE,
+    BLOCK_TYPE_UNKNOWN,
+)
+from src.loss_report import LossReport
+
 FIXTURES = Path(__file__).parent / "fixtures"


+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _block_types(message: dict) -> list[str]:
+    return [b.get("type") for b in (message.get("blocks") or [])]
+
+
+def _first_block(message: dict, block_type: str) -> dict | None:
+    for b in message.get("blocks") or []:
+        if b.get("type") == block_type:
+            return b
+    return None
+
+
+# ---------------------------------------------------------------------------
+# ChatGPT
+# ---------------------------------------------------------------------------
+
+
 class TestChatGPTNormalization:
-    """Test ChatGPTProvider.normalize_conversation() using fixture data."""
+    """ChatGPT normalize_conversation block-extraction behavior."""

    def _get_provider(self):
        from src.providers.chatgpt import ChatGPTProvider
-        # Bypass __init__ token check
        p = ChatGPTProvider.__new__(ChatGPTProvider)
        import requests
        p._session = requests.Session()
@@ -31,7 +64,6 @@ class TestChatGPTNormalization:
        assert result["id"] == "chatgpt-conv-001"
        assert result["title"] == "Python Async Tutorial"
        assert result["provider"] == "chatgpt"
-        # No entry in _project_map → project is None
        assert result["project"] is None
        assert result["created_at"] != ""
        assert result["updated_at"] != ""
@@ -46,7 +78,6 @@ class TestChatGPTNormalization:
        assert result["id"] == "chatgpt-conv-002"

    def test_normalizes_with_project_from_map(self):
-        """Project name from _project_map (populated by fetch_all_conversations) flows through."""
        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
        p = self._get_provider()
        p._project_map["chatgpt-conv-001"] = "My Research Project"
@@ -54,32 +85,167 @@ class TestChatGPTNormalization:

        assert result["project"] == "My Research Project"

-    def test_extracts_text_messages(self):
+    def test_text_message_emits_text_block(self):
        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
        p = self._get_provider()
        result = p.normalize_conversation(raw)

-        assert len(result["messages"]) >= 2
        user_msgs = [m for m in result["messages"] if m["role"] == "user"]
-        assert any("async" in m["content"].lower() for m in user_msgs)
+        # The "How does async/await..." message
+        async_msgs = [
+            m for m in user_msgs
+            if any(
+                "async" in (b.get("text") or "").lower()
+                for b in (m.get("blocks") or [])
+            )
+        ]
+        assert async_msgs, "expected a user message about async/await"
+        assert _block_types(async_msgs[0]) == [BLOCK_TYPE_TEXT]

-    def test_skips_non_text_content_with_warning(self, caplog):
-        import logging
+    def test_code_block_preserved_with_language(self):
        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
        p = self._get_provider()
-        with caplog.at_level(logging.WARNING):
-            result = p.normalize_conversation(raw)
-        # The fixture has an image_asset_pointer node — should be warned about
-        assert any(
-            "image_asset_pointer" in r.message or "rich content" in r.message
-            for r in caplog.records
-        )
+        result = p.normalize_conversation(raw)

-    def test_model_editable_context_included_without_warning(self, caplog):
-        """model_editable_context messages (project instructions) should be included, not warned about."""
-        import logging
-        conv = {
-            "id": "test-conv-mec",
+        assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"]
+        # The first assistant message is the async/await answer with a python fence
+        text_block = _first_block(assistant_msgs[0], BLOCK_TYPE_TEXT)
+        assert text_block is not None
+        assert "```python" in text_block["text"]
+
+    def test_multimodal_voice_user_message(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+
+        # node-mm-user: audio_transcription "What is the capital of France?"
+        # + real_time_user_audio_video_asset_pointer wrapping a sediment:// URL
+        capital_msgs = [
+            m for m in result["messages"]
+            if any(
+                "capital of france" in (b.get("text") or "").lower()
+                for b in (m.get("blocks") or [])
+            )
+        ]
+        assert capital_msgs, "expected the audio_transcription text to surface"
+        types = _block_types(capital_msgs[0])
+        assert BLOCK_TYPE_TEXT in types
+        assert BLOCK_TYPE_FILE_PLACEHOLDER in types
+
+        file_block = _first_block(capital_msgs[0], BLOCK_TYPE_FILE_PLACEHOLDER)
+        assert file_block["ref"].startswith("sediment://")
+        assert file_block["mime"] == "audio/wav"
+        assert file_block["size_bytes"] == 50000
+        assert file_block["duration_seconds"] == pytest.approx(2.5)
+
+    def test_multimodal_voice_reverse_order_preserved(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+
+        # node-mm-user-rev has parts in REVERSE order: asset first, transcription second.
+        rev_msgs = [
+            m for m in result["messages"]
+            if any(
+                "tell me more" in (b.get("text") or "").lower()
+                for b in (m.get("blocks") or [])
+            )
+        ]
+        assert rev_msgs, "expected the reverse-order voice message"
+        types = _block_types(rev_msgs[0])
+        # Order preserved: file_placeholder before text
+        assert types == [BLOCK_TYPE_FILE_PLACEHOLDER, BLOCK_TYPE_TEXT]
+
+    def test_image_only_user_message_renders(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+
+        image_msgs = [
+            m for m in result["messages"]
+            if any(b.get("type") == BLOCK_TYPE_IMAGE_PLACEHOLDER for b in (m.get("blocks") or []))
+        ]
+        assert image_msgs, "image-only user message should now render"
+
+    def test_user_editable_context_emits_blocks(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+
+        # The user_editable_context message has user_profile + user_instructions.
+        # It should now appear (was silently dropped pre-v0.4.0).
+        uec_msgs = [
+            m for m in result["messages"]
+            if any(
+                "Custom Instructions" in (b.get("text") or "")
+                for b in (m.get("blocks") or [])
+            )
+        ]
+        assert uec_msgs, "user_editable_context should be visible in output"
+        # Hidden context marker should be prepended.
+        assert uec_msgs[0]["blocks"][0]["type"] == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER
+
+    def test_user_editable_context_uses_safe_fence(self):
+        """The user_instructions value contains embedded triple-backticks; the rendered
+        Markdown must use a fence longer than 3 backticks so embedded fences are inert.
+        """
+        from src.blocks import render_blocks_to_markdown
+
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+
+        uec_msgs = [
+            m for m in result["messages"]
+            if any(
+                "Custom Instructions" in (b.get("text") or "")
+                for b in (m.get("blocks") or [])
+            )
+        ]
+        assert uec_msgs
+        rendered = render_blocks_to_markdown(uec_msgs[0]["blocks"])
+        # Content has ``` inside, so the wrap fence must be at least 4 backticks.
+        assert "````" in rendered, "expected a 4+ backtick safe-fence wrap"
+
+    def test_message_roles_are_valid(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+        for msg in result["messages"]:
+            assert msg["role"] in ("user", "assistant", "system", "tool")
+
+    def test_message_count_matches(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+        assert result["message_count"] == len(result["messages"])
+
+    def test_loss_report_records_messages(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        report = LossReport()
+        result = p.normalize_conversation(raw, report)
+        assert report.messages_rendered == len(result["messages"])
+        assert report.conversations == 1
+
+
+class TestChatGPTUnknownContent:
+    """Unrecognised content types should produce visible unknown blocks + WARNING + tally."""
+
+    def _get_provider(self):
+        from src.providers.chatgpt import ChatGPTProvider
+        p = ChatGPTProvider.__new__(ChatGPTProvider)
+        import requests
+        p._session = requests.Session()
+        p._org_id = None
+        p._project_ids = []
+        p._project_map = {}
+        p._project_name_cache = {}
+        return p
+
+    def _make_unknown_conv(self):
+        return {
+            "id": "test-unknown",
            "title": "Test",
            "create_time": 1700000000.0,
            "update_time": 1700000001.0,
@@ -91,46 +257,45 @@ class TestChatGPTNormalization:
                        "id": "msg1",
                        "author": {"role": "user"},
                        "content": {
-                            "content_type": "model_editable_context",
-                            "parts": ["These are the project instructions."],
+                            "content_type": "future_unknown_type_xyz",
+                            "some_field": "value",
                        },
-                        "create_time": 1700000001.0,
-                        "status": "finished_successfully",
                    },
                    "parent": "root",
                    "children": [],
                },
            },
        }
+
+    def test_unknown_content_type_produces_unknown_block(self):
+        p = self._get_provider()
+        result = p.normalize_conversation(self._make_unknown_conv())
+        assert any(
+            b.get("type") == BLOCK_TYPE_UNKNOWN
+            for m in result["messages"]
+            for b in (m.get("blocks") or [])
+        )
+
+    def test_unknown_content_type_logs_warning(self, caplog):
        p = self._get_provider()
        with caplog.at_level(logging.WARNING):
-            result = p.normalize_conversation(conv)
-        assert any(m["content"] == "These are the project instructions." for m in result["messages"])
-        assert not any("model_editable_context" in r.message for r in caplog.records)
+            p.normalize_conversation(self._make_unknown_conv())
+        assert any("future_unknown_type_xyz" in r.message for r in caplog.records)

-    def test_message_roles_are_valid(self):
-        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+    def test_unknown_content_type_increments_loss_report(self):
        p = self._get_provider()
-        result = p.normalize_conversation(raw)
-        for msg in result["messages"]:
-            assert msg["role"] in ("user", "assistant", "system")
+        report = LossReport()
+        p.normalize_conversation(self._make_unknown_conv(), report)
+        assert report.unknown_blocks["future_unknown_type_xyz"] == 1

-    def test_message_count_matches(self):
-        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
-        p = self._get_provider()
-        result = p.normalize_conversation(raw)
-        assert result["message_count"] == len(result["messages"])

-    def test_code_fence_preserved(self):
-        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
-        p = self._get_provider()
-        result = p.normalize_conversation(raw)
-        all_content = " ".join(m["content"] for m in result["messages"])
-        assert "```python" in all_content
+# ---------------------------------------------------------------------------
+# Claude
+# ---------------------------------------------------------------------------


 class TestClaudeNormalization:
-    """Test ClaudeProvider.normalize_conversation() using fixture data."""
+    """Claude normalize_conversation block-extraction behavior."""

    def _get_provider(self):
        from src.providers.claude import ClaudeProvider
@@ -150,55 +315,138 @@ class TestClaudeNormalization:
        assert result["provider"] == "claude"
        assert result["project"] == "StarTOS Packaging"
        assert result["created_at"] == "2024-06-10T14:32:00.000Z"
-        assert isinstance(result["messages"], list)

    def test_normalizes_without_project(self):
        raw = json.loads((FIXTURES / "claude_no_project.json").read_text())
        p = self._get_provider()
        result = p.normalize_conversation(raw)
-
        assert result["project"] is None
-        assert result["id"] == "claude-conv-002"

-    def test_string_content_extracted(self):
-        raw = json.loads((FIXTURES / "claude_no_project.json").read_text())
+    def test_string_content_emits_text_block(self):
+        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
        p = self._get_provider()
        result = p.normalize_conversation(raw)

-        assert any("Docker" in m["content"] for m in result["messages"])
+        thanks_msgs = [  # messages with any block whose "text" contains "thank you"
+            m for m in result["messages"]
+            if any(  # NOTE(review): matches on "text" only; block type is not asserted
+                "thank you" in (b.get("text") or "").lower()  # missing/None text -> ""
+                for b in (m.get("blocks") or [])  # tolerate absent or None "blocks"
+            )
+        ]
+        assert thanks_msgs  # at least one matching message is required

-    def test_list_content_extracted(self):
+    def test_list_content_emits_blocks_in_order(self):
        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
        p = self._get_provider()
        result = p.normalize_conversation(raw)

        assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"]
-        assert any("manifest" in m["content"].lower() for m in assistant_msgs)
+        # msg-002 (read here as the first assistant message) has text + tool_use, in that order.
+        assert assistant_msgs  # guard: the [0] lookup below needs at least one entry
+        types = _block_types(assistant_msgs[0])  # presumably block types in order — confirm
+        assert BLOCK_TYPE_TEXT in types
+        assert BLOCK_TYPE_TOOL_USE in types
+        # Order preserved: the first text entry comes before the first tool_use entry.
+        assert types.index(BLOCK_TYPE_TEXT) < types.index(BLOCK_TYPE_TOOL_USE)

-    def test_non_text_blocks_skipped_with_warning(self, caplog):
-        import logging
+    def test_tool_use_block_fields(self):
        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
        p = self._get_provider()
-        with caplog.at_level(logging.WARNING):
-            result = p.normalize_conversation(raw)
-        # The fixture has a tool_use block — should warn
+        result = p.normalize_conversation(raw)
+
+        assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"]
+        tool_block = _first_block(assistant_msgs[0], BLOCK_TYPE_TOOL_USE)  # NOTE(review): unguarded
+        assert tool_block["name"] == "search"  # expected values presumably mirror the fixture
+        assert tool_block["input"] == {"query": "startOS docs"}  # compared as a whole dict
+        assert tool_block["tool_id"] == "tool-001"
+
+    def test_image_block_emits_image_placeholder(self):
+        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+
+        msg004 = [  # every message holding at least one image placeholder block
+            m for m in result["messages"]
+            if any(b.get("type") == BLOCK_TYPE_IMAGE_PLACEHOLDER for b in (m.get("blocks") or []))
+        ]
+        assert msg004  # at least one such message must exist before indexing [0]
+        img = _first_block(msg004[0], BLOCK_TYPE_IMAGE_PLACEHOLDER)
+        assert img["ref"] == "claude-image-uuid-1"  # id presumably taken from the fixture
+
+    def test_unknown_block_type_records_loss(self):
+        from src.blocks import BLOCK_TYPE_UNKNOWN as _UNK  # local alias, used only in this test
+        raw = {
+            "uuid": "test-unknown",
+            "name": "T",
+            "chat_messages": [
+                {
+                    "uuid": "m1",
+                    "sender": "human",
+                    "content": [{"type": "future_block_xyz", "data": "..."}],  # made-up type
+                }
+            ],
+        }
+        p = self._get_provider()
+        report = LossReport()
+        result = p.normalize_conversation(raw, report)  # report passed as the 2nd argument
        assert any(
-            "tool_use" in r.message or "rich content" in r.message
-            for r in caplog.records
+            b.get("type") == _UNK  # some block anywhere in the result is typed unknown
+            for m in result["messages"]
+            for b in (m.get("blocks") or [])
        )
+        assert report.unknown_blocks["future_block_xyz"] == 1  # keyed by the input type name

-    def test_message_count_matches(self):
-        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
+    def test_thinking_block(self):
+        raw = {
+            "uuid": "thinking-test",
+            "name": "T",
+            "chat_messages": [
+                {
+                    "uuid": "m1",
+                    "sender": "assistant",
+                    "content": [
+                        {"type": "thinking", "thinking": "Let me reason about this."},
+                        {"type": "text", "text": "Here's the answer."},
+                    ],
+                }
+            ],
+        }
        p = self._get_provider()
        result = p.normalize_conversation(raw)
-        assert result["message_count"] == len(result["messages"])
+        types = _block_types(result["messages"][0])  # the single message built above
+        assert BLOCK_TYPE_THINKING in types  # NOTE(review): membership only, order not checked
+        assert BLOCK_TYPE_TEXT in types

-    def test_roles_normalized(self):
-        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
+    def test_tool_result_with_nested_text_blocks(self):
+        raw = {
+            "uuid": "tool-result-test",
+            "name": "T",
+            "chat_messages": [
+                {
+                    "uuid": "m1",
+                    "sender": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_result",
+                            "tool_use_id": "tool-001",
+                            "content": [  # nested blocks inside the tool_result
+                                {"type": "text", "text": "search hit 1"},
+                                {"type": "text", "text": "search hit 2"},
+                            ],
+                            "is_error": False,
+                        }
+                    ],
+                }
+            ],
+        }
        p = self._get_provider()
        result = p.normalize_conversation(raw)
-        for msg in result["messages"]:
-            assert msg["role"] in ("user", "assistant", "system")
+        tool_result = _first_block(result["messages"][0], BLOCK_TYPE_TOOL_RESULT)
+        assert tool_result is not None  # fail clearly before subscripting below
+        assert "search hit 1" in tool_result["output"]  # both nested texts must reach "output"
+        assert "search hit 2" in tool_result["output"]
+        assert tool_result["is_error"] is False  # same value as given in the input block

    def test_human_sender_maps_to_user(self):
        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
@@ -207,3 +455,10 @@ class TestClaudeNormalization:
        roles = {m["role"] for m in result["messages"]}
        assert "user" in roles
        assert "human" not in roles
+
+    def test_loss_report_messages_recorded(self):
+        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
+        p = self._get_provider()
+        report = LossReport()  # fresh tally, inspected after the call
+        result = p.normalize_conversation(raw, report)
+        assert report.messages_rendered == len(result["messages"])  # one count per message