"""Typed content blocks for normalized messages. Providers produce ordered lists of blocks; exporters render them. Living outside ``src/providers/`` deliberately — blocks are a separate concern from extraction or rendering, shared by both layers. Block dicts always have ``type`` set to one of the BLOCK_TYPE_* constants. Construct via the ``make_*`` helpers; never build dicts by hand. The ``unknown`` block constructor REQUIRES a corresponding WARNING log + ``LossReport`` tally at the call site — see plan §Data-loss visibility. """ import json from typing import Any BLOCK_TYPE_TEXT = "text" BLOCK_TYPE_CODE = "code" BLOCK_TYPE_THINKING = "thinking" BLOCK_TYPE_TOOL_USE = "tool_use" BLOCK_TYPE_TOOL_RESULT = "tool_result" BLOCK_TYPE_CITATION = "citation" BLOCK_TYPE_IMAGE_PLACEHOLDER = "image_placeholder" BLOCK_TYPE_FILE_PLACEHOLDER = "file_placeholder" BLOCK_TYPE_UNKNOWN = "unknown" BLOCK_TYPE_HIDDEN_CONTEXT_MARKER = "hidden_context_marker" UNKNOWN_REASON_UNKNOWN_TYPE = "unknown_type" UNKNOWN_REASON_EXTRACTION_FAILED = "extraction_failed" UNKNOWN_REASON_ALL_BLOCKS_FAILED = "all_blocks_failed" UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE = "unknown_field_in_known_type" _OBSERVED_KEYS_LIMIT = 10 # --------------------------------------------------------------------------- # Constructors # --------------------------------------------------------------------------- def make_text_block(text: str) -> dict | None: """Return a text block, or None if the text is empty/whitespace-only. Returning None lets callers do ``if block: blocks.append(block)`` and prune empty blocks at construction time. See plan §Finalizing the message dict. """ if not isinstance(text, str) or not text.strip(): return None return {"type": BLOCK_TYPE_TEXT, "text": text} def make_code_block(code: str, language: str = "") -> dict | None: """Return a code block, or None if code is empty.""" if not isinstance(code, str) or not code.strip(): return None return {"type": BLOCK_TYPE_CODE, "language": language or "", "code": code} def make_thinking_block(text: str) -> dict | None: """Return a thinking block, or None if empty.""" if not isinstance(text, str) or not text.strip(): return None return {"type": BLOCK_TYPE_THINKING, "text": text} def make_tool_use_block(name: str, input_data: Any, tool_id: str | None = None) -> dict: """Return a tool_use block. Always returns a block (no None) — tool calls are meaningful even with empty inputs. """ return { "type": BLOCK_TYPE_TOOL_USE, "name": name or "", "input": input_data if input_data is not None else {}, "tool_id": tool_id, } def make_tool_result_block( output: str, tool_name: str | None = None, is_error: bool = False, summary: str | None = None, ) -> dict: """Return a tool_result block. ``summary`` is an optional short human label rendered between header and fence (e.g. ChatGPT's ``metadata.reasoning_title`` for execution_output). """ return { "type": BLOCK_TYPE_TOOL_RESULT, "tool_name": tool_name, "output": output if isinstance(output, str) else str(output), "is_error": bool(is_error), "summary": summary, } def make_citation_block( url: str, title: str | None = None, snippet: str | None = None, ) -> dict | None: if not url: return None return { "type": BLOCK_TYPE_CITATION, "url": url, "title": title, "snippet": snippet, } def make_image_placeholder( ref: str, source: str = "unknown", mime: str | None = None, ) -> dict: """source ∈ {'user_upload', 'model_generated', 'unknown'}.""" return { "type": BLOCK_TYPE_IMAGE_PLACEHOLDER, "ref": ref or "", "source": source, "mime": mime, } def make_file_placeholder( ref: str, filename: str | None = None, mime: str | None = None, size_bytes: int | None = None, duration_seconds: float | None = None, ) -> dict: return { "type": BLOCK_TYPE_FILE_PLACEHOLDER, "ref": ref or "", "filename": filename, "mime": mime, "size_bytes": size_bytes, "duration_seconds": duration_seconds, } def make_unknown_block( raw_type: str, observed_keys: list[str] | None = None, reason: str = UNKNOWN_REASON_UNKNOWN_TYPE, summary: str | None = None, ) -> dict: """Construct an unknown block. Every call site MUST also emit a WARNING log and increment a LossReport tally — see plan §Data-loss visibility. The block surfaces the loss at read time; the WARNING surfaces it at run time. Both signals matter. """ keys = list(observed_keys or [])[:_OBSERVED_KEYS_LIMIT] return { "type": BLOCK_TYPE_UNKNOWN, "raw_type": raw_type, "observed_keys": keys, "reason": reason, "summary": summary, } def make_hidden_context_marker(content_type: str) -> dict: """A short prepend block that flags the surrounding message as hidden context. Driven by the ``metadata.is_visually_hidden_from_conversation`` flag, not by content_type matching. The marker tells the reader "this message is hidden in the source UI; we're showing it here for archival fidelity." """ return { "type": BLOCK_TYPE_HIDDEN_CONTEXT_MARKER, "content_type": content_type or "", } # --------------------------------------------------------------------------- # Rendering # --------------------------------------------------------------------------- def render_blocks_to_markdown(blocks: list[dict]) -> str: """Render an ordered list of blocks to a single Markdown string. Blocks are joined with one blank line between them. Pure function; no I/O. """ if not blocks: return "" rendered: list[str] = [] for block in blocks: chunk = _render_one(block) if chunk: rendered.append(chunk) return "\n\n".join(rendered) def _render_one(block: dict) -> str: btype = block.get("type", "") if btype == BLOCK_TYPE_TEXT: return block.get("text", "") if btype == BLOCK_TYPE_CODE: lang = block.get("language") or "" code = block.get("code", "") fence = _safe_fence(code) return f"{fence}{lang}\n{code}\n{fence}" if btype == BLOCK_TYPE_THINKING: text = block.get("text", "") quoted = _blockquote_prefix(text) return f"**💭 Reasoning**\n\n{quoted}" if btype == BLOCK_TYPE_TOOL_USE: name = block.get("name", "") input_data = block.get("input", {}) body_json = json.dumps(input_data, indent=2, sort_keys=False, default=str, ensure_ascii=False) fence = _safe_fence(body_json) body = f"{fence}json\n{body_json}\n{fence}" quoted = _blockquote_prefix(f"🔧 **Tool: {name}**\n{body}") return quoted if btype == BLOCK_TYPE_TOOL_RESULT: output = block.get("output", "") is_error = bool(block.get("is_error")) tool_name = block.get("tool_name") or "" summary = block.get("summary") or "" icon = "❌" if is_error else "📤" label = "Result (error)" if is_error else "Result" if tool_name: header = f"{icon} **{label}: {tool_name}**" else: header = f"{icon} **{label}**" fence = _safe_fence(output) body = f"{fence}\n{output}\n{fence}" if summary: inner = f"{header}\n*{summary}*\n{body}" else: inner = f"{header}\n{body}" return _blockquote_prefix(inner) if btype == BLOCK_TYPE_CITATION: url = block.get("url", "") title = block.get("title") or url return f"[{title}]({url})" if btype == BLOCK_TYPE_IMAGE_PLACEHOLDER: ref = block.get("ref", "") source = block.get("source", "unknown") mime = block.get("mime") meta_parts = [source] if source else [] if mime: meta_parts.append(mime) meta_parts.append("content not preserved in this export") meta = ", ".join(meta_parts) return f"> 🖼️ **Image attached** — `{ref}` ({meta})" if btype == BLOCK_TYPE_FILE_PLACEHOLDER: ref = block.get("ref", "") filename = block.get("filename") label = filename or ref mime = block.get("mime") size_bytes = block.get("size_bytes") duration = block.get("duration_seconds") meta_parts: list[str] = [] if mime: meta_parts.append(mime) if isinstance(size_bytes, int) and size_bytes > 0: kb = size_bytes / 1024 meta_parts.append(f"{kb:.1f} KB" if kb < 1024 else f"{kb / 1024:.2f} MB") if isinstance(duration, (int, float)) and duration > 0: meta_parts.append(f"{duration:.2f}s") meta_parts.append("content not preserved in this export") meta = ", ".join(meta_parts) return f"> 📎 **File attached** — `{label}` ({meta})" if btype == BLOCK_TYPE_UNKNOWN: raw_type = block.get("raw_type", "?") reason = block.get("reason", UNKNOWN_REASON_UNKNOWN_TYPE) keys = block.get("observed_keys") or [] summary = block.get("summary") first_line = f"⚠️ **Unsupported content** — type `{raw_type}` ({reason})" lines = [first_line] if summary: lines.append(summary) if keys: keys_str = ", ".join(f"`{k}`" for k in keys) lines.append(f"Keys observed: {keys_str}") return _blockquote_prefix("\n".join(lines)) if btype == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER: ctype = block.get("content_type", "") return f"> ℹ️ **Hidden context** — `{ctype}`" # Defensive: a block of unrecognised local type (shouldn't happen if # constructors are used). Render as visible warning rather than dropping. return f"> ⚠️ **Internal: unrecognised block type** — `{btype}`" # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _safe_fence(text: str) -> str: """Return a backtick fence longer than the longest run of backticks in ``text``. CommonMark requires the closing fence to be at least as long as the opening fence. Picking N+1 (where N = longest run in content) ensures the content's own backticks are inert. Minimum is 3. Verified live against Joplin during planning — see plan §Backtick-corruption defense. """ if not isinstance(text, str): return "```" longest_run = 0 current_run = 0 for ch in text: if ch == "`": current_run += 1 if current_run > longest_run: longest_run = current_run else: current_run = 0 fence_len = max(3, longest_run + 1) return "`" * fence_len def _blockquote_prefix(text: str) -> str: """Prefix every line of ``text`` with ``> `` so the whole block renders as a quote. Empty source lines become ``>`` (no trailing space) so blockquote continuity is preserved without trailing-whitespace noise. """ if not isinstance(text, str): return "" out_lines: list[str] = [] for line in text.split("\n"): if line == "": out_lines.append(">") else: out_lines.append(f"> {line}") return "\n".join(out_lines)