Files
ai-chatexport/src/blocks.py
JesseMarkowitz 68e8d532be feat: v0.4.1 — ChatGPT tool-output content types and conv_id fix
First real-data export against v0.4.0 surfaced 66 unknown blocks across
three content types — captured live and added.

Added:
- execution_output (Code Interpreter / container.exec / python tool
  output) → tool_result block. output=content.text,
  tool_name=author.name, is_error=metadata.aggregate_result.status,
  summary=metadata.reasoning_title
- system_error → error tool_result with tool_name=author.name
- tether_browsing_display: spinner placeholders (empty result+summary)
  skip silently with DEBUG log; defensive populated-case branch maps
  to tool_result (untested in real data)
- tool_result block schema: optional `summary` field rendered as
  italic line between header and fence
- tool_result rendering: tool_name appears in header when present
  (e.g. `📤 Result: container.exec`); existing tool_name=None calls
  unchanged
- _ROLE_LABELS["tool"] = ("🔧 Tool", "tool")

Fixed:
- chatgpt.normalize_conversation reads `conversation_id` as fallback
  for `id`. Live API uses conversation_id; fixtures use id.
  Pre-fix: empty id in YAML frontmatter and missing context in
  WARNING logs.

Tests: 11 new (192 total, 0 failures). Fixture extended with 4
tool-output cases (execution_output success, empty execution_output
that should skip, system_error, tether_browsing_display spinner).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-05 09:25:55 -04:00

340 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Typed content blocks for normalized messages.
Providers produce ordered lists of blocks; exporters render them. Living outside
``src/providers/`` deliberately — blocks are a separate concern from extraction
or rendering, shared by both layers.
Block dicts always have ``type`` set to one of the BLOCK_TYPE_* constants.
Construct via the ``make_*`` helpers; never build dicts by hand. The ``unknown``
block constructor REQUIRES a corresponding WARNING log + ``LossReport`` tally
at the call site — see plan §Data-loss visibility.
"""
import json
from typing import Any
# Values stored under the ``type`` key of a block dict; one per block kind.
BLOCK_TYPE_TEXT = "text"
BLOCK_TYPE_CODE = "code"
BLOCK_TYPE_THINKING = "thinking"
BLOCK_TYPE_TOOL_USE = "tool_use"
BLOCK_TYPE_TOOL_RESULT = "tool_result"
BLOCK_TYPE_CITATION = "citation"
BLOCK_TYPE_IMAGE_PLACEHOLDER = "image_placeholder"
BLOCK_TYPE_FILE_PLACEHOLDER = "file_placeholder"
BLOCK_TYPE_UNKNOWN = "unknown"
BLOCK_TYPE_HIDDEN_CONTEXT_MARKER = "hidden_context_marker"

# Values for the ``reason`` field of an ``unknown`` block
# (``make_unknown_block`` defaults to UNKNOWN_REASON_UNKNOWN_TYPE).
UNKNOWN_REASON_UNKNOWN_TYPE = "unknown_type"
UNKNOWN_REASON_EXTRACTION_FAILED = "extraction_failed"
UNKNOWN_REASON_ALL_BLOCKS_FAILED = "all_blocks_failed"
UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE = "unknown_field_in_known_type"

# Maximum number of entries ``make_unknown_block`` keeps in ``observed_keys``.
_OBSERVED_KEYS_LIMIT = 10
# ---------------------------------------------------------------------------
# Constructors
# ---------------------------------------------------------------------------
def make_text_block(text: str) -> dict | None:
"""Return a text block, or None if the text is empty/whitespace-only.
Returning None lets callers do ``if block: blocks.append(block)`` and prune
empty blocks at construction time. See plan §Finalizing the message dict.
"""
if not isinstance(text, str) or not text.strip():
return None
return {"type": BLOCK_TYPE_TEXT, "text": text}
def make_code_block(code: str, language: str = "") -> dict | None:
"""Return a code block, or None if code is empty."""
if not isinstance(code, str) or not code.strip():
return None
return {"type": BLOCK_TYPE_CODE, "language": language or "", "code": code}
def make_thinking_block(text: str) -> dict | None:
"""Return a thinking block, or None if empty."""
if not isinstance(text, str) or not text.strip():
return None
return {"type": BLOCK_TYPE_THINKING, "text": text}
def make_tool_use_block(name: str, input_data: Any, tool_id: str | None = None) -> dict:
    """Build a ``tool_use`` block.

    Unlike the text-like constructors this never returns None: a tool call is
    meaningful even when its input is empty. A falsy ``name`` is stored as
    ``""`` and an ``input_data`` of ``None`` is stored as ``{}``; ``tool_id``
    is stored as given.
    """
    tool_label = name if name else ""
    payload = {} if input_data is None else input_data
    return {
        "type": BLOCK_TYPE_TOOL_USE,
        "name": tool_label,
        "input": payload,
        "tool_id": tool_id,
    }
def make_tool_result_block(
    output: str,
    tool_name: str | None = None,
    is_error: bool = False,
    summary: str | None = None,
) -> dict:
    """Return a tool_result block.

    Args:
        output: Tool output text. ``None`` is stored as ``""``; any other
            non-string value is stored as ``str(output)``.
        tool_name: Name of the tool that produced the output, if known.
        is_error: Whether the result represents a failure; coerced to ``bool``.
        summary: Optional short human label rendered between header and fence
            (e.g. ChatGPT's ``metadata.reasoning_title`` for execution_output).

    Returns:
        A dict with keys ``type``, ``tool_name``, ``output``, ``is_error`` and
        ``summary``.
    """
    if output is None:
        # str(None) would put the literal text "None" into the export, which
        # is indistinguishable from a tool that really printed "None".
        output_text = ""
    elif isinstance(output, str):
        output_text = output
    else:
        output_text = str(output)
    return {
        "type": BLOCK_TYPE_TOOL_RESULT,
        "tool_name": tool_name,
        "output": output_text,
        "is_error": bool(is_error),
        "summary": summary,
    }
def make_citation_block(
url: str,
title: str | None = None,
snippet: str | None = None,
) -> dict | None:
if not url:
return None
return {
"type": BLOCK_TYPE_CITATION,
"url": url,
"title": title,
"snippet": snippet,
}
def make_image_placeholder(
    ref: str,
    source: str = "unknown",
    mime: str | None = None,
) -> dict:
    """Build an ``image_placeholder`` block.

    source ∈ {'user_upload', 'model_generated', 'unknown'}; the value is stored
    as given and is not validated here. A falsy ``ref`` is stored as ``""``.
    """
    placeholder: dict = {"type": BLOCK_TYPE_IMAGE_PLACEHOLDER}
    placeholder["ref"] = ref if ref else ""
    placeholder["source"] = source
    placeholder["mime"] = mime
    return placeholder
def make_file_placeholder(
    ref: str,
    filename: str | None = None,
    mime: str | None = None,
    size_bytes: int | None = None,
    duration_seconds: float | None = None,
) -> dict:
    """Build a ``file_placeholder`` block.

    A falsy ``ref`` is stored as ``""``; every other argument is stored as
    given (including ``None``).
    """
    placeholder: dict = {"type": BLOCK_TYPE_FILE_PLACEHOLDER}
    placeholder["ref"] = ref if ref else ""
    placeholder["filename"] = filename
    placeholder["mime"] = mime
    placeholder["size_bytes"] = size_bytes
    placeholder["duration_seconds"] = duration_seconds
    return placeholder
def make_unknown_block(
    raw_type: str,
    observed_keys: list[str] | None = None,
    reason: str = UNKNOWN_REASON_UNKNOWN_TYPE,
    summary: str | None = None,
) -> dict:
    """Build an ``unknown`` block describing content that could not be mapped.

    Every call site MUST also emit a WARNING log and increment a LossReport
    tally — see plan §Data-loss visibility. The block surfaces the loss at
    read time; the WARNING surfaces it at run time. Both signals matter.

    ``observed_keys`` is copied and truncated to at most
    ``_OBSERVED_KEYS_LIMIT`` entries; a falsy value yields an empty list.
    """
    if observed_keys:
        kept_keys = list(observed_keys)[:_OBSERVED_KEYS_LIMIT]
    else:
        kept_keys = []
    return {
        "type": BLOCK_TYPE_UNKNOWN,
        "raw_type": raw_type,
        "observed_keys": kept_keys,
        "reason": reason,
        "summary": summary,
    }
def make_hidden_context_marker(content_type: str) -> dict:
    """Build a short prepend block flagging the surrounding message as hidden context.

    Driven by the ``metadata.is_visually_hidden_from_conversation`` flag, not by
    content_type matching. The marker tells the reader "this message is
    hidden in the source UI; we're showing it here for archival fidelity."

    A falsy ``content_type`` is stored as ``""``.
    """
    marker: dict = {"type": BLOCK_TYPE_HIDDEN_CONTEXT_MARKER}
    marker["content_type"] = content_type if content_type else ""
    return marker
# ---------------------------------------------------------------------------
# Rendering
# ---------------------------------------------------------------------------
def render_blocks_to_markdown(blocks: list[dict]) -> str:
    """Render an ordered list of blocks to a single Markdown string.

    Each block is rendered with ``_render_one``; blocks that render to an empty
    string are skipped and the rest are joined with one blank line between
    them. A falsy ``blocks`` yields ``""``. Pure function; no I/O.
    """
    if not blocks:
        return ""
    pieces = (_render_one(item) for item in blocks)
    return "\n\n".join(piece for piece in pieces if piece)
def _render_text(block: dict) -> str:
    """Render a text block: the stored text, verbatim."""
    return block.get("text", "")


def _render_code(block: dict) -> str:
    """Render a code block as a fenced snippet with the language after the opening fence."""
    lang = block.get("language") or ""
    code = block.get("code", "")
    fence = _safe_fence(code)
    return f"{fence}{lang}\n{code}\n{fence}"


def _render_thinking(block: dict) -> str:
    """Render a thinking block as a bold heading followed by the quoted text."""
    quoted = _blockquote_prefix(block.get("text", ""))
    return f"**💭 Reasoning**\n\n{quoted}"


def _render_tool_use(block: dict) -> str:
    """Render a tool_use block as a quoted header plus the input as fenced JSON."""
    name = block.get("name", "")
    input_data = block.get("input", {})
    # default=str keeps rendering from failing on values json cannot encode.
    body_json = json.dumps(input_data, indent=2, sort_keys=False, default=str, ensure_ascii=False)
    fence = _safe_fence(body_json)
    body = f"{fence}json\n{body_json}\n{fence}"
    return _blockquote_prefix(f"🔧 **Tool: {name}**\n{body}")


def _render_tool_result(block: dict) -> str:
    """Render a tool_result block as a quoted header, optional italic summary, and fenced output."""
    output = block.get("output", "")
    is_error = bool(block.get("is_error"))
    tool_name = block.get("tool_name") or ""
    summary = block.get("summary") or ""
    # NOTE(review): the error icon is an empty string, so error headers start
    # with a space — confirm whether an emoji was intended here.
    icon = "" if is_error else "📤"
    label = "Result (error)" if is_error else "Result"
    if tool_name:
        header = f"{icon} **{label}: {tool_name}**"
    else:
        header = f"{icon} **{label}**"
    fence = _safe_fence(output)
    body = f"{fence}\n{output}\n{fence}"
    if summary:
        inner = f"{header}\n*{summary}*\n{body}"
    else:
        inner = f"{header}\n{body}"
    return _blockquote_prefix(inner)


def _render_citation(block: dict) -> str:
    """Render a citation block as a Markdown link; the URL doubles as the title when none is set."""
    url = block.get("url", "")
    title = block.get("title") or url
    return f"[{title}]({url})"


def _render_image_placeholder(block: dict) -> str:
    """Render an image placeholder as a one-line quote naming the ref, source and mime type."""
    ref = block.get("ref", "")
    source = block.get("source", "unknown")
    mime = block.get("mime")
    meta_parts = [source] if source else []
    if mime:
        meta_parts.append(mime)
    meta_parts.append("content not preserved in this export")
    meta = ", ".join(meta_parts)
    return f"> 🖼️ **Image attached** — `{ref}` ({meta})"


def _render_file_placeholder(block: dict) -> str:
    """Render a file placeholder as a one-line quote with filename (or ref) and known metadata."""
    ref = block.get("ref", "")
    filename = block.get("filename")
    label = filename or ref
    mime = block.get("mime")
    size_bytes = block.get("size_bytes")
    duration = block.get("duration_seconds")
    meta_parts: list[str] = []
    if mime:
        meta_parts.append(mime)
    if isinstance(size_bytes, int) and size_bytes > 0:
        kb = size_bytes / 1024
        # Below 1024 KB show kilobytes; from there on show megabytes.
        meta_parts.append(f"{kb:.1f} KB" if kb < 1024 else f"{kb / 1024:.2f} MB")
    if isinstance(duration, (int, float)) and duration > 0:
        meta_parts.append(f"{duration:.2f}s")
    meta_parts.append("content not preserved in this export")
    meta = ", ".join(meta_parts)
    return f"> 📎 **File attached** — `{label}` ({meta})"


def _render_unknown(block: dict) -> str:
    """Render an unknown block as a quoted warning with its raw type, reason, summary and keys."""
    raw_type = block.get("raw_type", "?")
    reason = block.get("reason", UNKNOWN_REASON_UNKNOWN_TYPE)
    keys = block.get("observed_keys") or []
    summary = block.get("summary")
    lines = [f"⚠️ **Unsupported content** — type `{raw_type}` ({reason})"]
    if summary:
        lines.append(summary)
    if keys:
        keys_str = ", ".join(f"`{k}`" for k in keys)
        lines.append(f"Keys observed: {keys_str}")
    return _blockquote_prefix("\n".join(lines))


def _render_hidden_context_marker(block: dict) -> str:
    """Render a hidden-context marker as a one-line quote naming the content type."""
    ctype = block.get("content_type", "")
    return f"> **Hidden context** — `{ctype}`"


# Maps each block ``type`` value to the function that renders it.
_BLOCK_RENDERERS = {
    BLOCK_TYPE_TEXT: _render_text,
    BLOCK_TYPE_CODE: _render_code,
    BLOCK_TYPE_THINKING: _render_thinking,
    BLOCK_TYPE_TOOL_USE: _render_tool_use,
    BLOCK_TYPE_TOOL_RESULT: _render_tool_result,
    BLOCK_TYPE_CITATION: _render_citation,
    BLOCK_TYPE_IMAGE_PLACEHOLDER: _render_image_placeholder,
    BLOCK_TYPE_FILE_PLACEHOLDER: _render_file_placeholder,
    BLOCK_TYPE_UNKNOWN: _render_unknown,
    BLOCK_TYPE_HIDDEN_CONTEXT_MARKER: _render_hidden_context_marker,
}


def _render_one(block: dict) -> str:
    """Render a single block dict to Markdown.

    Dispatches on ``block["type"]`` to the matching per-type renderer. A block
    whose type is missing or not one of the BLOCK_TYPE_* values is rendered as
    a visible warning rather than dropped.
    """
    btype = block.get("type", "")
    # Only strings can match a BLOCK_TYPE_* key; the isinstance guard also
    # keeps an unhashable ``type`` value from raising inside the dict lookup.
    renderer = _BLOCK_RENDERERS.get(btype) if isinstance(btype, str) else None
    if renderer is not None:
        return renderer(block)
    # Defensive: a block of unrecognised local type (shouldn't happen if
    # constructors are used). Render as visible warning rather than dropping.
    return f"> ⚠️ **Internal: unrecognised block type** — `{btype}`"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _safe_fence(text: str) -> str:
"""Return a backtick fence longer than the longest run of backticks in ``text``.
CommonMark requires the closing fence to be at least as long as the opening
fence. Picking N+1 (where N = longest run in content) ensures the content's
own backticks are inert. Minimum is 3.
Verified live against Joplin during planning — see plan
§Backtick-corruption defense.
"""
if not isinstance(text, str):
return "```"
longest_run = 0
current_run = 0
for ch in text:
if ch == "`":
current_run += 1
if current_run > longest_run:
longest_run = current_run
else:
current_run = 0
fence_len = max(3, longest_run + 1)
return "`" * fence_len
def _blockquote_prefix(text: str) -> str:
"""Prefix every line of ``text`` with ``> `` so the whole block renders as a quote.
Empty source lines become ``>`` (no trailing space) so blockquote continuity
is preserved without trailing-whitespace noise.
"""
if not isinstance(text, str):
return ""
out_lines: list[str] = []
for line in text.split("\n"):
if line == "":
out_lines.append(">")
else:
out_lines.append(f"> {line}")
return "\n".join(out_lines)