First real-data export against v0.4.0 surfaced 66 unknown blocks across three content types — captured live and added. Added: - execution_output (Code Interpreter / container.exec / python tool output) → tool_result block. output=content.text, tool_name=author.name, is_error=metadata.aggregate_result.status, summary=metadata.reasoning_title - system_error → error tool_result with tool_name=author.name - tether_browsing_display: spinner placeholders (empty result+summary) skip silently with DEBUG log; defensive populated-case branch maps to tool_result (untested in real data) - tool_result block schema: optional `summary` field rendered as italic line between header and fence - tool_result rendering: tool_name appears in header when present (e.g. `📤 Result: container.exec`); existing tool_name=None calls unchanged - _ROLE_LABELS["tool"] = ("🔧 Tool", "tool") Fixed: - chatgpt.normalize_conversation reads `conversation_id` as fallback for `id`. Live API uses conversation_id; fixtures use id. Pre-fix: empty id in YAML frontmatter and missing context in WARNING logs. Tests: 11 new (192 total, 0 failures). Fixture extended with 4 tool-output cases (execution_output success, empty execution_output that should skip, system_error, tether_browsing_display spinner). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
340 lines
11 KiB
Python
340 lines
11 KiB
Python
"""Typed content blocks for normalized messages.
|
||
|
||
Providers produce ordered lists of blocks; exporters render them. Living outside
|
||
``src/providers/`` deliberately — blocks are a separate concern from extraction
|
||
or rendering, shared by both layers.
|
||
|
||
Block dicts always have ``type`` set to one of the BLOCK_TYPE_* constants.
|
||
Construct via the ``make_*`` helpers; never build dicts by hand. The ``unknown``
|
||
block constructor REQUIRES a corresponding WARNING log + ``LossReport`` tally
|
||
at the call site — see plan §Data-loss visibility.
|
||
"""
|
||
|
||
import json
|
||
from typing import Any
|
||
|
||
# Values stored under a block dict's "type" key.
BLOCK_TYPE_TEXT = "text"
BLOCK_TYPE_CODE = "code"
BLOCK_TYPE_THINKING = "thinking"
BLOCK_TYPE_TOOL_USE = "tool_use"
BLOCK_TYPE_TOOL_RESULT = "tool_result"
BLOCK_TYPE_CITATION = "citation"
BLOCK_TYPE_IMAGE_PLACEHOLDER = "image_placeholder"
BLOCK_TYPE_FILE_PLACEHOLDER = "file_placeholder"
BLOCK_TYPE_UNKNOWN = "unknown"
BLOCK_TYPE_HIDDEN_CONTEXT_MARKER = "hidden_context_marker"

# Values stored under an unknown block's "reason" key.
UNKNOWN_REASON_UNKNOWN_TYPE = "unknown_type"
UNKNOWN_REASON_EXTRACTION_FAILED = "extraction_failed"
UNKNOWN_REASON_ALL_BLOCKS_FAILED = "all_blocks_failed"
UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE = "unknown_field_in_known_type"

# Maximum number of observed keys kept on an unknown block.
_OBSERVED_KEYS_LIMIT = 10
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Constructors
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def make_text_block(text: str) -> dict | None:
|
||
"""Return a text block, or None if the text is empty/whitespace-only.
|
||
|
||
Returning None lets callers do ``if block: blocks.append(block)`` and prune
|
||
empty blocks at construction time. See plan §Finalizing the message dict.
|
||
"""
|
||
if not isinstance(text, str) or not text.strip():
|
||
return None
|
||
return {"type": BLOCK_TYPE_TEXT, "text": text}
|
||
|
||
|
||
def make_code_block(code: str, language: str = "") -> dict | None:
|
||
"""Return a code block, or None if code is empty."""
|
||
if not isinstance(code, str) or not code.strip():
|
||
return None
|
||
return {"type": BLOCK_TYPE_CODE, "language": language or "", "code": code}
|
||
|
||
|
||
def make_thinking_block(text: str) -> dict | None:
|
||
"""Return a thinking block, or None if empty."""
|
||
if not isinstance(text, str) or not text.strip():
|
||
return None
|
||
return {"type": BLOCK_TYPE_THINKING, "text": text}
|
||
|
||
|
||
def make_tool_use_block(name: str, input_data: Any, tool_id: str | None = None) -> dict:
    """Build a ``tool_use`` block.

    Never returns None: a tool call is worth recording even when it carries no
    input. A falsy ``name`` is stored as ``""`` and a ``None`` input as ``{}``;
    ``tool_id`` is stored as given.
    """
    tool_name = name if name else ""
    payload = {} if input_data is None else input_data
    return {
        "type": BLOCK_TYPE_TOOL_USE,
        "name": tool_name,
        "input": payload,
        "tool_id": tool_id,
    }
|
||
|
||
|
||
def make_tool_result_block(
    output: str,
    tool_name: str | None = None,
    is_error: bool = False,
    summary: str | None = None,
) -> dict:
    """Return a tool_result block.

    Args:
        output: Text produced by the tool. ``None`` is stored as ``""``; any
            other non-string value is stored as ``str(output)``.
        tool_name: Optional name of the tool that produced the output.
        is_error: Whether the result represents a failure; stored as a bool.
        summary: Optional short human label rendered between header and
            fence (e.g. ChatGPT's ``metadata.reasoning_title`` for
            execution_output).

    Returns:
        A dict with keys ``type``, ``tool_name``, ``output``, ``is_error`` and
        ``summary``.
    """
    if output is None:
        # str(None) would put the literal text "None" inside the rendered
        # fence. Treat a missing output as empty, the same way the other
        # constructors coerce missing strings to "".
        output_text = ""
    elif isinstance(output, str):
        output_text = output
    else:
        output_text = str(output)
    return {
        "type": BLOCK_TYPE_TOOL_RESULT,
        "tool_name": tool_name,
        "output": output_text,
        "is_error": bool(is_error),
        "summary": summary,
    }
|
||
|
||
|
||
def make_citation_block(
|
||
url: str,
|
||
title: str | None = None,
|
||
snippet: str | None = None,
|
||
) -> dict | None:
|
||
if not url:
|
||
return None
|
||
return {
|
||
"type": BLOCK_TYPE_CITATION,
|
||
"url": url,
|
||
"title": title,
|
||
"snippet": snippet,
|
||
}
|
||
|
||
|
||
def make_image_placeholder(
    ref: str,
    source: str = "unknown",
    mime: str | None = None,
) -> dict:
    """Build an ``image_placeholder`` block.

    ``source`` ∈ {'user_upload', 'model_generated', 'unknown'}. A falsy ``ref``
    is stored as ``""``; ``source`` and ``mime`` are stored as given.
    """
    reference = ref if ref else ""
    return {
        "type": BLOCK_TYPE_IMAGE_PLACEHOLDER,
        "ref": reference,
        "source": source,
        "mime": mime,
    }
|
||
|
||
|
||
def make_file_placeholder(
    ref: str,
    filename: str | None = None,
    mime: str | None = None,
    size_bytes: int | None = None,
    duration_seconds: float | None = None,
) -> dict:
    """Build a ``file_placeholder`` block.

    A falsy ``ref`` is stored as ``""``; every other field is stored exactly
    as given.
    """
    reference = ref if ref else ""
    return {
        "type": BLOCK_TYPE_FILE_PLACEHOLDER,
        "ref": reference,
        "filename": filename,
        "mime": mime,
        "size_bytes": size_bytes,
        "duration_seconds": duration_seconds,
    }
|
||
|
||
|
||
def make_unknown_block(
    raw_type: str,
    observed_keys: list[str] | None = None,
    reason: str = UNKNOWN_REASON_UNKNOWN_TYPE,
    summary: str | None = None,
) -> dict:
    """Build an ``unknown`` block recording content that could not be mapped.

    Every call site MUST also emit a WARNING log and increment a LossReport
    tally — see plan §Data-loss visibility. The block makes the loss visible
    when the export is read; the WARNING makes it visible while the export
    runs. Both signals matter.

    ``observed_keys`` is copied into a new list and truncated to
    ``_OBSERVED_KEYS_LIMIT`` entries; a falsy value becomes an empty list.
    """
    observed = list(observed_keys) if observed_keys else []
    return {
        "type": BLOCK_TYPE_UNKNOWN,
        "raw_type": raw_type,
        "observed_keys": observed[:_OBSERVED_KEYS_LIMIT],
        "reason": reason,
        "summary": summary,
    }
|
||
|
||
|
||
def make_hidden_context_marker(content_type: str) -> dict:
    """Build a short prepend block flagging the surrounding message as hidden context.

    Driven by the ``metadata.is_visually_hidden_from_conversation`` flag, not by
    content_type matching. The marker tells the reader "this message is
    hidden in the source UI; we're showing it here for archival fidelity."

    A falsy ``content_type`` is stored as ``""``.
    """
    ctype = content_type if content_type else ""
    return {"type": BLOCK_TYPE_HIDDEN_CONTEXT_MARKER, "content_type": ctype}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Rendering
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def render_blocks_to_markdown(blocks: list[dict]) -> str:
    """Render an ordered list of blocks to one Markdown string.

    Each block is rendered on its own; blocks that render to an empty string
    are dropped, and the rest are joined with one blank line between them.
    An empty or missing list yields ``""``. Pure function; no I/O.
    """
    if not blocks:
        return ""

    chunks = (_render_one(block) for block in blocks)
    return "\n\n".join(chunk for chunk in chunks if chunk)
|
||
|
||
|
||
def _render_one(block: dict) -> str:
    """Render a single block dict to its Markdown form.

    Branches on ``block["type"]``. Missing fields fall back to empty or
    placeholder values rather than raising. A type that matches none of the
    BLOCK_TYPE_* constants is rendered as a visible warning instead of being
    dropped.
    """
    kind = block.get("type", "")

    if kind == BLOCK_TYPE_TEXT:
        return block.get("text", "")

    if kind == BLOCK_TYPE_CODE:
        info = block.get("language") or ""
        source = block.get("code", "")
        ticks = _safe_fence(source)
        return f"{ticks}{info}\n{source}\n{ticks}"

    if kind == BLOCK_TYPE_THINKING:
        return "**💭 Reasoning**\n\n" + _blockquote_prefix(block.get("text", ""))

    if kind == BLOCK_TYPE_TOOL_USE:
        tool = block.get("name", "")
        payload = json.dumps(
            block.get("input", {}),
            indent=2,
            sort_keys=False,
            default=str,
            ensure_ascii=False,
        )
        ticks = _safe_fence(payload)
        fenced = f"{ticks}json\n{payload}\n{ticks}"
        return _blockquote_prefix(f"🔧 **Tool: {tool}**\n{fenced}")

    if kind == BLOCK_TYPE_TOOL_RESULT:
        output = block.get("output", "")
        failed = bool(block.get("is_error"))
        tool = block.get("tool_name") or ""
        note = block.get("summary") or ""
        icon, label = ("❌", "Result (error)") if failed else ("📤", "Result")
        # The tool name is only appended to the header when one is present.
        suffix = f": {tool}" if tool else ""
        parts = [f"{icon} **{label}{suffix}**"]
        if note:
            # Optional summary sits as an italic line between header and fence.
            parts.append(f"*{note}*")
        ticks = _safe_fence(output)
        parts.append(f"{ticks}\n{output}\n{ticks}")
        return _blockquote_prefix("\n".join(parts))

    if kind == BLOCK_TYPE_CITATION:
        link = block.get("url", "")
        caption = block.get("title") or link
        return f"[{caption}]({link})"

    if kind == BLOCK_TYPE_IMAGE_PLACEHOLDER:
        ref = block.get("ref", "")
        origin = block.get("source", "unknown")
        media_type = block.get("mime")
        details = []
        if origin:
            details.append(origin)
        if media_type:
            details.append(media_type)
        details.append("content not preserved in this export")
        meta = ", ".join(details)
        return f"> 🖼️ **Image attached** — `{ref}` ({meta})"

    if kind == BLOCK_TYPE_FILE_PLACEHOLDER:
        # Prefer the filename as the visible label; fall back to the ref.
        label = block.get("filename") or block.get("ref", "")
        media_type = block.get("mime")
        size = block.get("size_bytes")
        seconds = block.get("duration_seconds")
        details: list[str] = []
        if media_type:
            details.append(media_type)
        if isinstance(size, int) and size > 0:
            kib = size / 1024
            details.append(f"{kib / 1024:.2f} MB" if kib >= 1024 else f"{kib:.1f} KB")
        if isinstance(seconds, (int, float)) and seconds > 0:
            details.append(f"{seconds:.2f}s")
        details.append("content not preserved in this export")
        meta = ", ".join(details)
        return f"> 📎 **File attached** — `{label}` ({meta})"

    if kind == BLOCK_TYPE_UNKNOWN:
        raw = block.get("raw_type", "?")
        why = block.get("reason", UNKNOWN_REASON_UNKNOWN_TYPE)
        out = [f"⚠️ **Unsupported content** — type `{raw}` ({why})"]
        note = block.get("summary")
        if note:
            out.append(note)
        seen = block.get("observed_keys") or []
        if seen:
            out.append("Keys observed: " + ", ".join(f"`{k}`" for k in seen))
        return _blockquote_prefix("\n".join(out))

    if kind == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER:
        ctype = block.get("content_type", "")
        return f"> ℹ️ **Hidden context** — `{ctype}`"

    # Defensive: a block of unrecognised local type (shouldn't happen if
    # constructors are used). Render as visible warning rather than dropping.
    return f"> ⚠️ **Internal: unrecognised block type** — `{kind}`"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _safe_fence(text: str) -> str:
|
||
"""Return a backtick fence longer than the longest run of backticks in ``text``.
|
||
|
||
CommonMark requires the closing fence to be at least as long as the opening
|
||
fence. Picking N+1 (where N = longest run in content) ensures the content's
|
||
own backticks are inert. Minimum is 3.
|
||
|
||
Verified live against Joplin during planning — see plan
|
||
§Backtick-corruption defense.
|
||
"""
|
||
if not isinstance(text, str):
|
||
return "```"
|
||
longest_run = 0
|
||
current_run = 0
|
||
for ch in text:
|
||
if ch == "`":
|
||
current_run += 1
|
||
if current_run > longest_run:
|
||
longest_run = current_run
|
||
else:
|
||
current_run = 0
|
||
fence_len = max(3, longest_run + 1)
|
||
return "`" * fence_len
|
||
|
||
|
||
def _blockquote_prefix(text: str) -> str:
|
||
"""Prefix every line of ``text`` with ``> `` so the whole block renders as a quote.
|
||
|
||
Empty source lines become ``>`` (no trailing space) so blockquote continuity
|
||
is preserved without trailing-whitespace noise.
|
||
"""
|
||
if not isinstance(text, str):
|
||
return ""
|
||
out_lines: list[str] = []
|
||
for line in text.split("\n"):
|
||
if line == "":
|
||
out_lines.append(">")
|
||
else:
|
||
out_lines.append(f"> {line}")
|
||
return "\n".join(out_lines)
|