feat: v0.4.0 — rich content support with typed blocks and loss visibility

Extracts per-message content into a typed `blocks` list (text, code, thinking, tool_use, tool_result, image_placeholder, file_placeholder, unknown) and renders them at exporter write time. Voice transcripts, Custom Instructions, and image references now appear in exports instead of being silently dropped. Foundation: - src/blocks.py: pure block constructors, _safe_fence (fence-corruption defense, verified live in Joplin), _blockquote_prefix, render - src/loss_report.py: per-run tally surfaced as INFO summary at end of export so silently-dropped data becomes visible Providers: - ChatGPT: dispatch on content_type produces typed blocks; voice shapes (audio_transcription, audio_asset_pointer, real_time_user_audio_video_asset_pointer) locked from live DevTools capture; Custom Instructions bug fix (parts-vs-direct-fields); role filter lifted; hidden-context marker driven by is_visually_hidden_from_conversation flag - Claude: defensive dispatch for text/thinking/tool_use/tool_result/image with recursive nested-block flattening; untested against real rich-content data — fix-forward in v0.4.1 Exporter: - Markdown renders from blocks at write time via render_blocks_to_markdown; backward-compat fallback to content for any pre-v0.4.0 cached data Tests: - 27 new tests across providers, exporters, CLI; fixtures rebuilt with real-shape ChatGPT voice + Custom Instructions cases - 181/181 pass Behavior changes (intentional): - JSON output omits content; consumers should read blocks - Per-conversation message counts increase (Custom Instructions, image-only, tool-only messages now appear) - Existing exports not auto-re-rendered; users wanting fresh output run cache --clear then export Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-04 23:17:18 -04:00
parent 4798edcea7
commit 473d02f71a
16 changed files with 1786 additions and 232 deletions
--- a/src/blocks.py
+++ b/src/blocks.py
@@ -0,0 +1,322 @@
+"""Typed content blocks for normalized messages.
+
+Providers produce ordered lists of blocks; exporters render them. Living outside
+``src/providers/`` deliberately — blocks are a separate concern from extraction
+or rendering, shared by both layers.
+
+Block dicts always have ``type`` set to one of the BLOCK_TYPE_* constants.
+Construct via the ``make_*`` helpers; never build dicts by hand. The ``unknown``
+block constructor REQUIRES a corresponding WARNING log + ``LossReport`` tally
+at the call site — see plan §Data-loss visibility.
+"""
+
+import json
+from typing import Any
+
+# Values a block dict may carry in its ``type`` key.
+BLOCK_TYPE_TEXT = "text"
+BLOCK_TYPE_CODE = "code"
+BLOCK_TYPE_THINKING = "thinking"
+BLOCK_TYPE_TOOL_USE = "tool_use"
+BLOCK_TYPE_TOOL_RESULT = "tool_result"
+BLOCK_TYPE_CITATION = "citation"
+BLOCK_TYPE_IMAGE_PLACEHOLDER = "image_placeholder"
+BLOCK_TYPE_FILE_PLACEHOLDER = "file_placeholder"
+BLOCK_TYPE_UNKNOWN = "unknown"
+BLOCK_TYPE_HIDDEN_CONTEXT_MARKER = "hidden_context_marker"
+
+# Values for the ``reason`` key of an ``unknown`` block.
+UNKNOWN_REASON_UNKNOWN_TYPE = "unknown_type"
+UNKNOWN_REASON_EXTRACTION_FAILED = "extraction_failed"
+UNKNOWN_REASON_ALL_BLOCKS_FAILED = "all_blocks_failed"
+UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE = "unknown_field_in_known_type"
+
+# Upper bound on how many observed keys ``make_unknown_block`` keeps.
+_OBSERVED_KEYS_LIMIT = 10
+
+
+# ---------------------------------------------------------------------------
+# Constructors
+# ---------------------------------------------------------------------------
+
+
+def make_text_block(text: str) -> dict | None:
+    """Build a ``text`` block from ``text``.
+
+    Yields ``None`` instead of a block when ``text`` is not a string or holds
+    nothing but whitespace, so a caller can guard with ``if block:`` before
+    appending and never store an empty block. See plan §Finalizing the
+    message dict.
+    """
+    has_content = isinstance(text, str) and bool(text.strip())
+    return {"type": BLOCK_TYPE_TEXT, "text": text} if has_content else None
+
+
+def make_code_block(code: str, language: str = "") -> dict | None:
+    """Build a ``code`` block holding ``code`` and its ``language`` tag.
+
+    A falsy ``language`` is stored as the empty string. Yields ``None`` when
+    ``code`` is not a string or is blank (empty or whitespace-only).
+    """
+    if isinstance(code, str) and code.strip():
+        return {"type": BLOCK_TYPE_CODE, "language": language or "", "code": code}
+    return None
+
+
+def make_thinking_block(text: str) -> dict | None:
+    """Build a ``thinking`` block from ``text``.
+
+    Yields ``None`` when ``text`` is not a string or is blank (empty or
+    whitespace-only).
+    """
+    if isinstance(text, str) and text.strip():
+        return {"type": BLOCK_TYPE_THINKING, "text": text}
+    return None
+
+
+def make_tool_use_block(name: str, input_data: Any, tool_id: str | None = None) -> dict:
+    """Build a ``tool_use`` block describing one tool invocation.
+
+    Unlike the text-like constructors this never yields ``None``: a tool call
+    is worth recording even when it carried no input. A falsy ``name`` is
+    stored as ``""`` and a ``None`` input as ``{}``; ``tool_id`` is stored
+    unchanged.
+    """
+    block: dict = {"type": BLOCK_TYPE_TOOL_USE}
+    block["name"] = name if name else ""
+    block["input"] = {} if input_data is None else input_data
+    block["tool_id"] = tool_id
+    return block
+
+
+def make_tool_result_block(
+    output: str,
+    tool_name: str | None = None,
+    is_error: bool = False,
+) -> dict:
+    """Return a tool_result block.
+
+    ``output`` is always stored as a string: strings pass through untouched,
+    ``None`` becomes ``""`` (the same "missing means empty" treatment
+    ``make_tool_use_block`` gives a ``None`` input), and any other value is
+    converted with ``str()``. ``is_error`` is coerced to a real ``bool``;
+    ``tool_name`` is stored unchanged.
+    """
+    if output is None:
+        # str(None) would store the literal text "None", which then shows up in
+        # the export looking like genuine tool output instead of "no output".
+        output_text = ""
+    elif isinstance(output, str):
+        output_text = output
+    else:
+        output_text = str(output)
+    return {
+        "type": BLOCK_TYPE_TOOL_RESULT,
+        "tool_name": tool_name,
+        "output": output_text,
+        "is_error": bool(is_error),
+    }
+
+
+def make_citation_block(
+    url: str,
+    title: str | None = None,
+    snippet: str | None = None,
+) -> dict | None:
+    """Build a ``citation`` block pointing at ``url``.
+
+    ``title`` and ``snippet`` are optional and stored as given. Yields ``None``
+    when ``url`` is falsy.
+    """
+    if url:
+        return {
+            "type": BLOCK_TYPE_CITATION,
+            "url": url,
+            "title": title,
+            "snippet": snippet,
+        }
+    return None
+
+
+def make_image_placeholder(
+    ref: str,
+    source: str = "unknown",
+    mime: str | None = None,
+) -> dict:
+    """Build an ``image_placeholder`` block standing in for an image.
+
+    ``source`` is expected to be one of ``'user_upload'``,
+    ``'model_generated'`` or ``'unknown'``; it is stored as given, without
+    validation, as is ``mime``. A falsy ``ref`` is stored as ``""``.
+    """
+    return dict(
+        type=BLOCK_TYPE_IMAGE_PLACEHOLDER,
+        ref=ref if ref else "",
+        source=source,
+        mime=mime,
+    )
+
+
+def make_file_placeholder(
+    ref: str,
+    filename: str | None = None,
+    mime: str | None = None,
+    size_bytes: int | None = None,
+    duration_seconds: float | None = None,
+) -> dict:
+    """Build a ``file_placeholder`` block standing in for an attached file.
+
+    Always yields a block. A falsy ``ref`` is stored as ``""``; ``filename``,
+    ``mime``, ``size_bytes`` and ``duration_seconds`` are optional metadata
+    stored exactly as passed.
+    """
+    return dict(
+        type=BLOCK_TYPE_FILE_PLACEHOLDER,
+        ref=ref if ref else "",
+        filename=filename,
+        mime=mime,
+        size_bytes=size_bytes,
+        duration_seconds=duration_seconds,
+    )
+
+
+def make_unknown_block(
+    raw_type: str,
+    observed_keys: list[str] | None = None,
+    reason: str = UNKNOWN_REASON_UNKNOWN_TYPE,
+    summary: str | None = None,
+) -> dict:
+    """Build an ``unknown`` block recording content that could not be handled.
+
+    At most ``_OBSERVED_KEYS_LIMIT`` entries of ``observed_keys`` are kept, in
+    their original order; a missing or empty list is stored as ``[]``.
+    ``raw_type``, ``reason`` and ``summary`` are stored as given.
+
+    Callers are REQUIRED to pair every call with a WARNING log and a LossReport
+    tally (plan §Data-loss visibility): the block makes the loss visible to
+    whoever reads the export, the WARNING makes it visible during the run.
+    """
+    kept_keys = list(observed_keys)[:_OBSERVED_KEYS_LIMIT] if observed_keys else []
+    return dict(
+        type=BLOCK_TYPE_UNKNOWN,
+        raw_type=raw_type,
+        observed_keys=kept_keys,
+        reason=reason,
+        summary=summary,
+    )
+
+
+def make_hidden_context_marker(content_type: str) -> dict:
+    """Build the marker block that labels a message as hidden context.
+
+    The marker is a short block meant to be prepended to a message that the
+    source UI hides. It is driven by the
+    ``metadata.is_visually_hidden_from_conversation`` flag rather than by
+    matching on content_type, and tells the reader the message is shown here
+    for archival fidelity. A falsy ``content_type`` is stored as ``""``.
+    """
+    label = content_type if content_type else ""
+    return {"type": BLOCK_TYPE_HIDDEN_CONTEXT_MARKER, "content_type": label}
+
+
+# ---------------------------------------------------------------------------
+# Rendering
+# ---------------------------------------------------------------------------
+
+
+def render_blocks_to_markdown(blocks: list[dict]) -> str:
+    """Turn an ordered block list into one Markdown string.
+
+    Each block is rendered on its own; blocks that render to nothing are left
+    out, and the remaining pieces are separated by a single blank line. An
+    empty (or otherwise falsy) ``blocks`` gives ``""``. Pure function: no I/O.
+    """
+    if blocks:
+        pieces = [piece for item in blocks if (piece := _render_one(item))]
+        return "\n\n".join(pieces)
+    return ""
+
+
+def _render_text(block: dict) -> str:
+    """Text renders verbatim."""
+    return block.get("text", "")
+
+
+def _render_code(block: dict) -> str:
+    """Code renders as a fenced block tagged with its language, if any."""
+    source = block.get("code", "")
+    tag = block.get("language") or ""
+    ticks = _safe_fence(source)
+    return f"{ticks}{tag}\n{source}\n{ticks}"
+
+
+def _render_thinking(block: dict) -> str:
+    """Thinking renders as a bold heading followed by the text as a quote."""
+    body = _blockquote_prefix(block.get("text", ""))
+    return f"**💭 Reasoning**\n\n{body}"
+
+
+def _render_tool_use(block: dict) -> str:
+    """A tool call renders as a quoted header plus its input as fenced JSON."""
+    tool = block.get("name", "")
+    payload = json.dumps(
+        block.get("input", {}),
+        indent=2,
+        sort_keys=False,
+        default=str,
+        ensure_ascii=False,
+    )
+    ticks = _safe_fence(payload)
+    fenced = f"{ticks}json\n{payload}\n{ticks}"
+    return _blockquote_prefix(f"🔧 **Tool: {tool}**\n{fenced}")
+
+
+def _render_tool_result(block: dict) -> str:
+    """A tool result renders as a quoted header plus the output in a fence."""
+    output = block.get("output", "")
+    if block.get("is_error"):
+        header = "❌ **Result (error)**"
+    else:
+        header = "📤 **Result**"
+    ticks = _safe_fence(output)
+    fenced = f"{ticks}\n{output}\n{ticks}"
+    return _blockquote_prefix(f"{header}\n{fenced}")
+
+
+def _render_citation(block: dict) -> str:
+    """A citation renders as a Markdown link; the URL doubles as a missing title."""
+    target = block.get("url", "")
+    label = block.get("title") or target
+    return f"[{label}]({target})"
+
+
+def _render_image_placeholder(block: dict) -> str:
+    """An image placeholder renders as a one-line quoted notice."""
+    ref = block.get("ref", "")
+    origin = block.get("source", "unknown")
+    mime = block.get("mime")
+    # Only the pieces that are actually present make it into the parenthetical.
+    details = [part for part in (origin, mime) if part]
+    details.append("content not preserved in this export")
+    meta = ", ".join(details)
+    return f"> 🖼️ **Image attached** — `{ref}` ({meta})"
+
+
+def _render_file_placeholder(block: dict) -> str:
+    """A file placeholder renders as a one-line quoted notice with metadata."""
+    label = block.get("filename") or block.get("ref", "")
+    details: list[str] = []
+
+    mime = block.get("mime")
+    if mime:
+        details.append(mime)
+
+    size_bytes = block.get("size_bytes")
+    if isinstance(size_bytes, int) and size_bytes > 0:
+        # Sizes of 1024 KB and above are shown in MB, smaller ones in KB.
+        kib = size_bytes / 1024
+        details.append(f"{kib / 1024:.2f} MB" if kib >= 1024 else f"{kib:.1f} KB")
+
+    duration = block.get("duration_seconds")
+    if isinstance(duration, (int, float)) and duration > 0:
+        details.append(f"{duration:.2f}s")
+
+    details.append("content not preserved in this export")
+    meta = ", ".join(details)
+    return f"> 📎 **File attached** — `{label}` ({meta})"
+
+
+def _render_unknown(block: dict) -> str:
+    """An unknown block renders as a quoted warning with whatever is known."""
+    raw_type = block.get("raw_type", "?")
+    reason = block.get("reason", UNKNOWN_REASON_UNKNOWN_TYPE)
+    report = [f"⚠️ **Unsupported content** — type `{raw_type}` ({reason})"]
+
+    summary = block.get("summary")
+    if summary:
+        report.append(summary)
+
+    keys = block.get("observed_keys") or []
+    if keys:
+        report.append("Keys observed: " + ", ".join(f"`{k}`" for k in keys))
+
+    return _blockquote_prefix("\n".join(report))
+
+
+def _render_hidden_context_marker(block: dict) -> str:
+    """The hidden-context marker renders as a one-line quoted notice."""
+    ctype = block.get("content_type", "")
+    return f"> ℹ️ **Hidden context** — `{ctype}`"
+
+
+# Ordered (block type, renderer) pairs. Kept as a sequence scanned with ``==``
+# rather than a dict so that any ``type`` value, hashable or not, can be
+# compared and fall through to the "unrecognised" notice.
+_BLOCK_RENDERERS = (
+    (BLOCK_TYPE_TEXT, _render_text),
+    (BLOCK_TYPE_CODE, _render_code),
+    (BLOCK_TYPE_THINKING, _render_thinking),
+    (BLOCK_TYPE_TOOL_USE, _render_tool_use),
+    (BLOCK_TYPE_TOOL_RESULT, _render_tool_result),
+    (BLOCK_TYPE_CITATION, _render_citation),
+    (BLOCK_TYPE_IMAGE_PLACEHOLDER, _render_image_placeholder),
+    (BLOCK_TYPE_FILE_PLACEHOLDER, _render_file_placeholder),
+    (BLOCK_TYPE_UNKNOWN, _render_unknown),
+    (BLOCK_TYPE_HIDDEN_CONTEXT_MARKER, _render_hidden_context_marker),
+)
+
+
+def _render_one(block: dict) -> str:
+    """Render a single block to Markdown according to its ``type``.
+
+    A block whose ``type`` matches none of the BLOCK_TYPE_* constants (which
+    should not happen when the ``make_*`` constructors are used) is rendered
+    as a visible warning line instead of being dropped.
+    """
+    btype = block.get("type", "")
+    for known_type, renderer in _BLOCK_RENDERERS:
+        if btype == known_type:
+            return renderer(block)
+    return f"> ⚠️ **Internal: unrecognised block type** — `{btype}`"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _safe_fence(text: str) -> str:
+    """Pick a backtick fence that the backticks inside ``text`` cannot close.
+
+    The result is the shortest run of backticks, never fewer than three, that
+    does not occur anywhere in ``text`` — in other words one backtick longer
+    than the longest run in the content. CommonMark only closes a fence on a
+    run at least as long as the opener, so shorter runs in the content stay
+    inert. Non-string input gets the plain three-backtick fence.
+
+    Verified live against Joplin during planning — see plan
+    §Backtick-corruption defense.
+    """
+    fence = "```"
+    if isinstance(text, str):
+        # A run of length k contains every shorter run, so the first fence
+        # length not found in the text is exactly longest-run + 1.
+        while fence in text:
+            fence += "`"
+    return fence
+
+
+def _blockquote_prefix(text: str) -> str:
+    """Turn ``text`` into a Markdown blockquote, line by line.
+
+    Lines are split on ``\\n``. Each non-empty line gets a leading ``> ``; an
+    empty line becomes a bare ``>`` so the quote is not interrupted and no
+    trailing whitespace is produced. Non-string input gives ``""``.
+    """
+    if not isinstance(text, str):
+        return ""
+    return "\n".join(f"> {row}" if row else ">" for row in text.split("\n"))