feat: v0.4.0 — rich content support with typed blocks and loss visibility
Extracts per-message content into a typed `blocks` list (text, code, thinking, tool_use, tool_result, image_placeholder, file_placeholder, unknown) and renders them at exporter write time. Voice transcripts, Custom Instructions, and image references now appear in exports instead of being silently dropped. Foundation: - src/blocks.py: pure block constructors, _safe_fence (fence-corruption defense, verified live in Joplin), _blockquote_prefix, render - src/loss_report.py: per-run tally surfaced as INFO summary at end of export so silently-dropped data becomes visible Providers: - ChatGPT: dispatch on content_type produces typed blocks; voice shapes (audio_transcription, audio_asset_pointer, real_time_user_audio_video_asset_pointer) locked from live DevTools capture; Custom Instructions bug fix (parts-vs-direct-fields); role filter lifted; hidden-context marker driven by is_visually_hidden_from_conversation flag - Claude: defensive dispatch for text/thinking/tool_use/tool_result/image with recursive nested-block flattening; untested against real rich-content data — fix-forward in v0.4.1 Exporter: - Markdown renders from blocks at write time via render_blocks_to_markdown; backward-compat fallback to content for any pre-v0.4.0 cached data Tests: - 27 new tests across providers, exporters, CLI; fixtures rebuilt with real-shape ChatGPT voice + Custom Instructions cases - 181/181 pass Behavior changes (intentional): - JSON output omits content; consumers should read blocks - Per-conversation message counts increase (Custom Instructions, image-only, tool-only messages now appear) - Existing exports not auto-re-rendered; users wanting fresh output run cache --clear then export Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
322
src/blocks.py
Normal file
322
src/blocks.py
Normal file
@@ -0,0 +1,322 @@
|
||||
"""Typed content blocks for normalized messages.
|
||||
|
||||
Providers produce ordered lists of blocks; exporters render them. Living outside
|
||||
``src/providers/`` deliberately — blocks are a separate concern from extraction
|
||||
or rendering, shared by both layers.
|
||||
|
||||
Block dicts always have ``type`` set to one of the BLOCK_TYPE_* constants.
|
||||
Construct via the ``make_*`` helpers; never build dicts by hand. The ``unknown``
|
||||
block constructor REQUIRES a corresponding WARNING log + ``LossReport`` tally
|
||||
at the call site — see plan §Data-loss visibility.
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
# Block type tags: every block dict stores one of these under its "type" key.
BLOCK_TYPE_TEXT = "text"
BLOCK_TYPE_CODE = "code"
BLOCK_TYPE_THINKING = "thinking"
BLOCK_TYPE_TOOL_USE = "tool_use"
BLOCK_TYPE_TOOL_RESULT = "tool_result"
BLOCK_TYPE_CITATION = "citation"
BLOCK_TYPE_IMAGE_PLACEHOLDER = "image_placeholder"
BLOCK_TYPE_FILE_PLACEHOLDER = "file_placeholder"
BLOCK_TYPE_UNKNOWN = "unknown"
BLOCK_TYPE_HIDDEN_CONTEXT_MARKER = "hidden_context_marker"

# Reason codes stored in the "reason" field of an unknown block
# (make_unknown_block defaults to UNKNOWN_REASON_UNKNOWN_TYPE).
UNKNOWN_REASON_UNKNOWN_TYPE = "unknown_type"
UNKNOWN_REASON_EXTRACTION_FAILED = "extraction_failed"
UNKNOWN_REASON_ALL_BLOCKS_FAILED = "all_blocks_failed"
UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE = "unknown_field_in_known_type"

# Upper bound on how many observed keys make_unknown_block keeps per block.
_OBSERVED_KEYS_LIMIT = 10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constructors
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def make_text_block(text: str) -> dict | None:
|
||||
"""Return a text block, or None if the text is empty/whitespace-only.
|
||||
|
||||
Returning None lets callers do ``if block: blocks.append(block)`` and prune
|
||||
empty blocks at construction time. See plan §Finalizing the message dict.
|
||||
"""
|
||||
if not isinstance(text, str) or not text.strip():
|
||||
return None
|
||||
return {"type": BLOCK_TYPE_TEXT, "text": text}
|
||||
|
||||
|
||||
def make_code_block(code: str, language: str = "") -> dict | None:
|
||||
"""Return a code block, or None if code is empty."""
|
||||
if not isinstance(code, str) or not code.strip():
|
||||
return None
|
||||
return {"type": BLOCK_TYPE_CODE, "language": language or "", "code": code}
|
||||
|
||||
|
||||
def make_thinking_block(text: str) -> dict | None:
|
||||
"""Return a thinking block, or None if empty."""
|
||||
if not isinstance(text, str) or not text.strip():
|
||||
return None
|
||||
return {"type": BLOCK_TYPE_THINKING, "text": text}
|
||||
|
||||
|
||||
def make_tool_use_block(name: str, input_data: Any, tool_id: str | None = None) -> dict:
    """Build a tool_use block.

    Unlike the text-like constructors this never returns None: a tool call is
    worth recording even when it carried no input. A falsy ``name`` is stored
    as ``""`` and a ``None`` input as an empty dict.
    """
    block: dict = {"type": BLOCK_TYPE_TOOL_USE}
    block["name"] = name if name else ""
    block["input"] = {} if input_data is None else input_data
    block["tool_id"] = tool_id
    return block
|
||||
|
||||
|
||||
def make_tool_result_block(
    output: str,
    tool_name: str | None = None,
    is_error: bool = False,
) -> dict:
    """Return a tool_result block.

    Args:
        output: The tool's output text. Non-string values are tolerated and
            converted with ``str()``; ``None`` is stored as the empty string.
        tool_name: Name of the tool that produced the result, if known.
        is_error: Whether the result represents a failure; coerced to bool.

    Returns:
        A block dict with ``type``, ``tool_name``, ``output`` and ``is_error``.
    """
    if output is None:
        # str(None) would store the literal text "None", which a reader of the
        # export cannot tell apart from a tool that really returned "None".
        output_text = ""
    elif isinstance(output, str):
        output_text = output
    else:
        output_text = str(output)
    return {
        "type": BLOCK_TYPE_TOOL_RESULT,
        "tool_name": tool_name,
        "output": output_text,
        "is_error": bool(is_error),
    }
|
||||
|
||||
|
||||
def make_citation_block(
|
||||
url: str,
|
||||
title: str | None = None,
|
||||
snippet: str | None = None,
|
||||
) -> dict | None:
|
||||
if not url:
|
||||
return None
|
||||
return {
|
||||
"type": BLOCK_TYPE_CITATION,
|
||||
"url": url,
|
||||
"title": title,
|
||||
"snippet": snippet,
|
||||
}
|
||||
|
||||
|
||||
def make_image_placeholder(
    ref: str,
    source: str = "unknown",
    mime: str | None = None,
) -> dict:
    """Build an image_placeholder block.

    ``source`` is expected to be one of 'user_upload', 'model_generated' or
    'unknown'. A falsy ``ref`` is stored as the empty string.
    """
    placeholder: dict = {"type": BLOCK_TYPE_IMAGE_PLACEHOLDER}
    placeholder["ref"] = ref if ref else ""
    placeholder["source"] = source
    placeholder["mime"] = mime
    return placeholder
|
||||
|
||||
|
||||
def make_file_placeholder(
    ref: str,
    filename: str | None = None,
    mime: str | None = None,
    size_bytes: int | None = None,
    duration_seconds: float | None = None,
) -> dict:
    """Build a file_placeholder block.

    A falsy ``ref`` is stored as the empty string; every other field is
    stored exactly as passed, including None.
    """
    placeholder: dict = {
        "type": BLOCK_TYPE_FILE_PLACEHOLDER,
        "ref": ref if ref else "",
    }
    placeholder.update(
        filename=filename,
        mime=mime,
        size_bytes=size_bytes,
        duration_seconds=duration_seconds,
    )
    return placeholder
|
||||
|
||||
|
||||
def make_unknown_block(
    raw_type: str,
    observed_keys: list[str] | None = None,
    reason: str = UNKNOWN_REASON_UNKNOWN_TYPE,
    summary: str | None = None,
) -> dict:
    """Build an unknown block describing content that could not be mapped.

    Callers MUST pair every call with a WARNING log and a LossReport tally
    increment — see plan §Data-loss visibility. The block makes the loss
    visible to whoever reads the export; the WARNING makes it visible while
    the export runs. Both signals matter.

    At most ``_OBSERVED_KEYS_LIMIT`` entries of ``observed_keys`` are kept.
    """
    if observed_keys:
        kept_keys = list(observed_keys)[:_OBSERVED_KEYS_LIMIT]
    else:
        kept_keys = []
    return {
        "type": BLOCK_TYPE_UNKNOWN,
        "raw_type": raw_type,
        "observed_keys": kept_keys,
        "reason": reason,
        "summary": summary,
    }
|
||||
|
||||
|
||||
def make_hidden_context_marker(content_type: str) -> dict:
    """Build the short marker block that labels a message as hidden context.

    The marker is driven by the ``metadata.is_visually_hidden_from_conversation``
    flag rather than by matching on content_type. Its message to the reader:
    "this message is hidden in the source UI; we're showing it here for
    archival fidelity." A falsy ``content_type`` is stored as ``""``.
    """
    marker: dict = {"type": BLOCK_TYPE_HIDDEN_CONTEXT_MARKER}
    marker["content_type"] = content_type if content_type else ""
    return marker
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rendering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def render_blocks_to_markdown(blocks: list[dict]) -> str:
    """Turn an ordered list of blocks into one Markdown string.

    Each block is rendered on its own; blocks that render to nothing are
    skipped and the rest are separated by a single blank line. Pure function
    with no I/O. An empty or missing list yields ``""``.
    """
    if not blocks:
        return ""

    pieces = [piece for block in blocks if (piece := _render_one(block))]
    return "\n\n".join(pieces)
|
||||
|
||||
|
||||
def _render_one(block: dict) -> str:
    """Render a single block dict to its Markdown fragment.

    Dispatches on the block's ``type`` field. A type that matches none of the
    BLOCK_TYPE_* constants is rendered as a visible internal warning instead
    of being dropped.
    """
    kind = block.get("type", "")

    if kind == BLOCK_TYPE_TEXT:
        return block.get("text", "")

    if kind == BLOCK_TYPE_CODE:
        info = block.get("language") or ""
        source = block.get("code", "")
        ticks = _safe_fence(source)
        return f"{ticks}{info}\n{source}\n{ticks}"

    if kind == BLOCK_TYPE_THINKING:
        return "**💭 Reasoning**\n\n" + _blockquote_prefix(block.get("text", ""))

    if kind == BLOCK_TYPE_TOOL_USE:
        return _render_tool_use(block)

    if kind == BLOCK_TYPE_TOOL_RESULT:
        return _render_tool_result(block)

    if kind == BLOCK_TYPE_CITATION:
        link = block.get("url", "")
        link_text = block.get("title") or link
        return f"[{link_text}]({link})"

    if kind == BLOCK_TYPE_IMAGE_PLACEHOLDER:
        return _render_image_placeholder(block)

    if kind == BLOCK_TYPE_FILE_PLACEHOLDER:
        return _render_file_placeholder(block)

    if kind == BLOCK_TYPE_UNKNOWN:
        return _render_unknown(block)

    if kind == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER:
        hidden_type = block.get("content_type", "")
        return f"> ℹ️ **Hidden context** — `{hidden_type}`"

    # Defensive: should be unreachable when blocks come from the make_*
    # constructors. Surface it as a visible warning rather than dropping it.
    return f"> ⚠️ **Internal: unrecognised block type** — `{kind}`"


def _render_tool_use(block: dict) -> str:
    """Render a tool_use block as a quoted header plus fenced JSON input."""
    tool = block.get("name", "")
    payload = json.dumps(
        block.get("input", {}),
        indent=2,
        sort_keys=False,
        default=str,
        ensure_ascii=False,
    )
    ticks = _safe_fence(payload)
    fenced = f"{ticks}json\n{payload}\n{ticks}"
    return _blockquote_prefix(f"🔧 **Tool: {tool}**\n{fenced}")


def _render_tool_result(block: dict) -> str:
    """Render a tool_result block as a quoted header plus fenced output."""
    raw = block.get("output", "")
    if block.get("is_error"):
        heading = "❌ **Result (error)**"
    else:
        heading = "📤 **Result**"
    ticks = _safe_fence(raw)
    fenced = f"{ticks}\n{raw}\n{ticks}"
    return _blockquote_prefix(f"{heading}\n{fenced}")


def _render_image_placeholder(block: dict) -> str:
    """Render an image_placeholder block as a one-line quoted notice."""
    ref = block.get("ref", "")
    candidates = (block.get("source", "unknown"), block.get("mime"))
    details = [part for part in candidates if part]
    details.append("content not preserved in this export")
    meta = ", ".join(details)
    return f"> 🖼️ **Image attached** — `{ref}` ({meta})"


def _render_file_placeholder(block: dict) -> str:
    """Render a file_placeholder block as a one-line quoted notice."""
    label = block.get("filename") or block.get("ref", "")
    mime = block.get("mime")
    size = block.get("size_bytes")
    seconds = block.get("duration_seconds")

    details: list[str] = []
    if mime:
        details.append(mime)
    if isinstance(size, int) and size > 0:
        kilobytes = size / 1024
        if kilobytes < 1024:
            details.append(f"{kilobytes:.1f} KB")
        else:
            details.append(f"{kilobytes / 1024:.2f} MB")
    if isinstance(seconds, (int, float)) and seconds > 0:
        details.append(f"{seconds:.2f}s")
    details.append("content not preserved in this export")

    meta = ", ".join(details)
    return f"> 📎 **File attached** — `{label}` ({meta})"


def _render_unknown(block: dict) -> str:
    """Render an unknown block as a quoted warning with optional details."""
    raw_type = block.get("raw_type", "?")
    reason = block.get("reason", UNKNOWN_REASON_UNKNOWN_TYPE)
    out = [f"⚠️ **Unsupported content** — type `{raw_type}` ({reason})"]

    summary = block.get("summary")
    if summary:
        out.append(summary)

    seen_keys = block.get("observed_keys") or []
    if seen_keys:
        keys_str = ", ".join(f"`{k}`" for k in seen_keys)
        out.append(f"Keys observed: {keys_str}")

    return _blockquote_prefix("\n".join(out))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _safe_fence(text: str) -> str:
|
||||
"""Return a backtick fence longer than the longest run of backticks in ``text``.
|
||||
|
||||
CommonMark requires the closing fence to be at least as long as the opening
|
||||
fence. Picking N+1 (where N = longest run in content) ensures the content's
|
||||
own backticks are inert. Minimum is 3.
|
||||
|
||||
Verified live against Joplin during planning — see plan
|
||||
§Backtick-corruption defense.
|
||||
"""
|
||||
if not isinstance(text, str):
|
||||
return "```"
|
||||
longest_run = 0
|
||||
current_run = 0
|
||||
for ch in text:
|
||||
if ch == "`":
|
||||
current_run += 1
|
||||
if current_run > longest_run:
|
||||
longest_run = current_run
|
||||
else:
|
||||
current_run = 0
|
||||
fence_len = max(3, longest_run + 1)
|
||||
return "`" * fence_len
|
||||
|
||||
|
||||
def _blockquote_prefix(text: str) -> str:
|
||||
"""Prefix every line of ``text`` with ``> `` so the whole block renders as a quote.
|
||||
|
||||
Empty source lines become ``>`` (no trailing space) so blockquote continuity
|
||||
is preserved without trailing-whitespace noise.
|
||||
"""
|
||||
if not isinstance(text, str):
|
||||
return ""
|
||||
out_lines: list[str] = []
|
||||
for line in text.split("\n"):
|
||||
if line == "":
|
||||
out_lines.append(">")
|
||||
else:
|
||||
out_lines.append(f"> {line}")
|
||||
return "\n".join(out_lines)
|
||||
Reference in New Issue
Block a user