# ai-chatexport/src/blocks.py

"""Typed content blocks for normalized messages.

Providers produce ordered lists of blocks; exporters render them. Living outside
``src/providers/`` deliberately — blocks are a separate concern from extraction
or rendering, shared by both layers.

Block dicts always have ``type`` set to one of the BLOCK_TYPE_* constants.
Construct via the ``make_*`` helpers; never build dicts by hand. The ``unknown``
block constructor REQUIRES a corresponding WARNING log + ``LossReport`` tally
at the call site — see plan §Data-loss visibility.
"""

import json
from typing import Any

# Discriminator values stored under the ``type`` key of every block dict.
BLOCK_TYPE_TEXT = "text"
BLOCK_TYPE_CODE = "code"
BLOCK_TYPE_THINKING = "thinking"
BLOCK_TYPE_TOOL_USE = "tool_use"
BLOCK_TYPE_TOOL_RESULT = "tool_result"
BLOCK_TYPE_CITATION = "citation"
BLOCK_TYPE_IMAGE_PLACEHOLDER = "image_placeholder"
BLOCK_TYPE_FILE_PLACEHOLDER = "file_placeholder"
BLOCK_TYPE_UNKNOWN = "unknown"
BLOCK_TYPE_HIDDEN_CONTEXT_MARKER = "hidden_context_marker"

# Values for the ``reason`` field of an ``unknown`` block;
# ``make_unknown_block`` defaults to UNKNOWN_REASON_UNKNOWN_TYPE.
UNKNOWN_REASON_UNKNOWN_TYPE = "unknown_type"
UNKNOWN_REASON_EXTRACTION_FAILED = "extraction_failed"
UNKNOWN_REASON_ALL_BLOCKS_FAILED = "all_blocks_failed"
UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE = "unknown_field_in_known_type"

# Upper bound on how many observed keys ``make_unknown_block`` keeps.
_OBSERVED_KEYS_LIMIT = 10


# ---------------------------------------------------------------------------
# Constructors
# ---------------------------------------------------------------------------


def make_text_block(text: str) -> dict | None:
    """Return a text block, or None if the text is empty/whitespace-only.

    Returning None lets callers do ``if block: blocks.append(block)`` and prune
    empty blocks at construction time. See plan §Finalizing the message dict.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    return {"type": BLOCK_TYPE_TEXT, "text": text}


def make_code_block(code: str, language: str = "") -> dict | None:
    """Return a code block, or None if code is empty."""
    if not isinstance(code, str) or not code.strip():
        return None
    return {"type": BLOCK_TYPE_CODE, "language": language or "", "code": code}


def make_thinking_block(text: str) -> dict | None:
    """Return a thinking block, or None if empty."""
    if not isinstance(text, str) or not text.strip():
        return None
    return {"type": BLOCK_TYPE_THINKING, "text": text}


def make_tool_use_block(name: str, input_data: Any, tool_id: str | None = None) -> dict:
    """Build a ``tool_use`` block.

    A block is always produced (never ``None``): a tool call is meaningful
    even when it carries no input. A falsy ``name`` is stored as ``""`` and a
    ``None`` ``input_data`` as an empty dict; ``tool_id`` is stored unchanged.
    """
    tool_name = name if name else ""
    tool_input = {} if input_data is None else input_data
    return dict(
        type=BLOCK_TYPE_TOOL_USE,
        name=tool_name,
        input=tool_input,
        tool_id=tool_id,
    )


def make_tool_result_block(
    output: str,
    tool_name: str | None = None,
    is_error: bool = False,
    summary: str | None = None,
) -> dict:
    """Return a tool_result block.

    Always returns a block, even when the tool produced no output.

    Args:
        output: The tool's output. ``None`` is stored as the empty string;
            any other non-string value is stored as ``str(output)``.
        tool_name: Name of the tool that produced the result, if known.
        is_error: Whether the result represents a failure; coerced to ``bool``.
        summary: Optional short human label rendered between header and
            fence (e.g. ChatGPT's ``metadata.reasoning_title`` for
            execution_output).

    Returns:
        A dict with keys ``type``, ``tool_name``, ``output``, ``is_error``
        and ``summary``.
    """
    if output is None:
        # ``str(None)`` would archive the literal text "None", which a reader
        # cannot tell apart from a tool that really printed "None".
        output_text = ""
    elif isinstance(output, str):
        output_text = output
    else:
        output_text = str(output)
    return {
        "type": BLOCK_TYPE_TOOL_RESULT,
        "tool_name": tool_name,
        "output": output_text,
        "is_error": bool(is_error),
        "summary": summary,
    }


def make_citation_block(
    url: str,
    title: str | None = None,
    snippet: str | None = None,
) -> dict | None:
    if not url:
        return None
    return {
        "type": BLOCK_TYPE_CITATION,
        "url": url,
        "title": title,
        "snippet": snippet,
    }


def make_image_placeholder(
    ref: str,
    source: str = "unknown",
    mime: str | None = None,
) -> dict:
    """Build an ``image_placeholder`` block.

    ``source`` ∈ {'user_upload', 'model_generated', 'unknown'}. A falsy ``ref``
    is stored as ``""``; ``source`` and ``mime`` are stored unchanged.
    """
    placeholder = {"type": BLOCK_TYPE_IMAGE_PLACEHOLDER}
    placeholder.update(ref=ref or "", source=source, mime=mime)
    return placeholder


def make_file_placeholder(
    ref: str,
    filename: str | None = None,
    mime: str | None = None,
    size_bytes: int | None = None,
    duration_seconds: float | None = None,
) -> dict:
    """Build a ``file_placeholder`` block.

    A falsy ``ref`` is stored as ``""``. The remaining metadata fields are
    stored unchanged, including ``None`` when not supplied.
    """
    reference = ref if ref else ""
    return dict(
        type=BLOCK_TYPE_FILE_PLACEHOLDER,
        ref=reference,
        filename=filename,
        mime=mime,
        size_bytes=size_bytes,
        duration_seconds=duration_seconds,
    )


def make_unknown_block(
    raw_type: str,
    observed_keys: list[str] | None = None,
    reason: str = UNKNOWN_REASON_UNKNOWN_TYPE,
    summary: str | None = None,
) -> dict:
    """Build an ``unknown`` block describing content that could not be mapped.

    At most ``_OBSERVED_KEYS_LIMIT`` entries of ``observed_keys`` are kept, and
    a falsy ``observed_keys`` becomes an empty list.

    Every call site MUST also emit a WARNING log and increment a LossReport
    tally — see plan §Data-loss visibility. The block surfaces the loss at
    read time; the WARNING surfaces it at run time. Both signals matter.
    """
    if observed_keys:
        kept_keys = list(observed_keys)[:_OBSERVED_KEYS_LIMIT]
    else:
        kept_keys = []
    return dict(
        type=BLOCK_TYPE_UNKNOWN,
        raw_type=raw_type,
        observed_keys=kept_keys,
        reason=reason,
        summary=summary,
    )


def make_hidden_context_marker(content_type: str) -> dict:
    """Build the short marker block prepended to a hidden-context message.

    The marker is driven by the ``metadata.is_visually_hidden_from_conversation``
    flag, not by content_type matching. It tells the reader "this message is
    hidden in the source UI; we're showing it here for archival fidelity."
    A falsy ``content_type`` is stored as ``""``.
    """
    marker_content_type = content_type if content_type else ""
    return dict(
        type=BLOCK_TYPE_HIDDEN_CONTEXT_MARKER,
        content_type=marker_content_type,
    )


# ---------------------------------------------------------------------------
# Rendering
# ---------------------------------------------------------------------------


def render_blocks_to_markdown(blocks: list[dict]) -> str:
    """Render ``blocks``, in order, into one Markdown string.

    Each block is rendered on its own; blocks that render to an empty string
    are skipped and the rest are separated by one blank line. A falsy
    ``blocks`` gives ``""``. Pure function; no I/O.
    """
    if not blocks:
        return ""

    pieces = [piece for piece in map(_render_one, blocks) if piece]
    return "\n\n".join(pieces)


def _inline_code(value: Any) -> str:
    """Wrap ``value`` in a Markdown code span that survives embedded backticks.

    Text without backticks keeps the plain single-backtick form. Otherwise the
    delimiter is made one backtick longer than the longest run in the text and
    the content is padded with one space on each side, which CommonMark strips
    again, so a leading or trailing backtick cannot merge with the delimiter.
    """
    text = value if isinstance(value, str) else str(value)
    if "`" not in text:
        return f"`{text}`"
    delimiter = "``"
    while delimiter in text:
        delimiter += "`"
    return f"{delimiter} {text} {delimiter}"


def _render_one(block: dict) -> str:
    """Render a single block dict to Markdown.

    Dispatches on ``block["type"]``. Text is emitted verbatim; code, tool
    input and tool output go into backtick fences sized by ``_safe_fence``;
    thinking, tool and unknown blocks are blockquoted; placeholders and the
    hidden-context marker are one-line blockquotes. Values shown as inline
    code (refs, filenames, raw types, keys, content types) go through
    ``_inline_code`` so a backtick inside them cannot end the span early.

    Args:
        block: A block dict, normally produced by one of the ``make_*``
            constructors.

    Returns:
        The Markdown for the block. A block whose ``type`` is not recognised
        renders as a visible warning line rather than being dropped.
    """
    btype = block.get("type", "")
    if btype == BLOCK_TYPE_TEXT:
        return block.get("text", "")
    if btype == BLOCK_TYPE_CODE:
        lang = block.get("language") or ""
        code = block.get("code", "")
        if not isinstance(code, str):
            # Size the fence from the text that is actually emitted.
            code = str(code)
        fence = _safe_fence(code)
        return f"{fence}{lang}\n{code}\n{fence}"
    if btype == BLOCK_TYPE_THINKING:
        text = block.get("text", "")
        quoted = _blockquote_prefix(text)
        return f"**💭 Reasoning**\n\n{quoted}"
    if btype == BLOCK_TYPE_TOOL_USE:
        name = block.get("name", "")
        input_data = block.get("input", {})
        # default=str keeps rendering going when the input holds values that
        # are not JSON-serialisable.
        body_json = json.dumps(input_data, indent=2, sort_keys=False, default=str, ensure_ascii=False)
        fence = _safe_fence(body_json)
        body = f"{fence}json\n{body_json}\n{fence}"
        quoted = _blockquote_prefix(f"🔧 **Tool: {name}**\n{body}")
        return quoted
    if btype == BLOCK_TYPE_TOOL_RESULT:
        output = block.get("output", "")
        if not isinstance(output, str):
            # Size the fence from the text that is actually emitted.
            output = str(output)
        is_error = bool(block.get("is_error"))
        tool_name = block.get("tool_name") or ""
        summary = block.get("summary") or ""
        icon = "❌" if is_error else "📤"
        label = "Result (error)" if is_error else "Result"
        if tool_name:
            header = f"{icon} **{label}: {tool_name}**"
        else:
            header = f"{icon} **{label}**"
        fence = _safe_fence(output)
        body = f"{fence}\n{output}\n{fence}"
        if summary:
            inner = f"{header}\n*{summary}*\n{body}"
        else:
            inner = f"{header}\n{body}"
        return _blockquote_prefix(inner)
    if btype == BLOCK_TYPE_CITATION:
        url = block.get("url", "")
        # Fall back to the URL itself when no title was captured.
        title = block.get("title") or url
        return f"[{title}]({url})"
    if btype == BLOCK_TYPE_IMAGE_PLACEHOLDER:
        ref = block.get("ref", "")
        source = block.get("source", "unknown")
        mime = block.get("mime")
        meta_parts = [source] if source else []
        if mime:
            meta_parts.append(mime)
        meta_parts.append("content not preserved in this export")
        meta = ", ".join(meta_parts)
        return f"> 🖼️ **Image attached** — {_inline_code(ref)} ({meta})"
    if btype == BLOCK_TYPE_FILE_PLACEHOLDER:
        ref = block.get("ref", "")
        filename = block.get("filename")
        label = filename or ref
        mime = block.get("mime")
        size_bytes = block.get("size_bytes")
        duration = block.get("duration_seconds")
        meta_parts: list[str] = []
        if mime:
            meta_parts.append(mime)
        if isinstance(size_bytes, int) and size_bytes > 0:
            kb = size_bytes / 1024
            meta_parts.append(f"{kb:.1f} KB" if kb < 1024 else f"{kb / 1024:.2f} MB")
        if isinstance(duration, (int, float)) and duration > 0:
            meta_parts.append(f"{duration:.2f}s")
        meta_parts.append("content not preserved in this export")
        meta = ", ".join(meta_parts)
        return f"> 📎 **File attached** — {_inline_code(label)} ({meta})"
    if btype == BLOCK_TYPE_UNKNOWN:
        raw_type = block.get("raw_type", "?")
        reason = block.get("reason", UNKNOWN_REASON_UNKNOWN_TYPE)
        keys = block.get("observed_keys") or []
        summary = block.get("summary")
        first_line = f"⚠️ **Unsupported content** — type {_inline_code(raw_type)} ({reason})"
        lines = [first_line]
        if summary:
            lines.append(summary)
        if keys:
            keys_str = ", ".join(_inline_code(k) for k in keys)
            lines.append(f"Keys observed: {keys_str}")
        return _blockquote_prefix("\n".join(lines))
    if btype == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER:
        ctype = block.get("content_type", "")
        return f"> ℹ️ **Hidden context** — {_inline_code(ctype)}"

    # Defensive: a block of unrecognised local type (shouldn't happen if
    # constructors are used). Render as visible warning rather than dropping.
    return f"> ⚠️ **Internal: unrecognised block type** — {_inline_code(btype)}"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _safe_fence(text: str) -> str:
    """Return a backtick fence longer than the longest run of backticks in ``text``.

    CommonMark requires the closing fence to be at least as long as the opening
    fence. Picking N+1 (where N = longest run in content) ensures the content's
    own backticks are inert. Minimum is 3.

    Verified live against Joplin during planning — see plan
    §Backtick-corruption defense.
    """
    if not isinstance(text, str):
        return "```"
    longest_run = 0
    current_run = 0
    for ch in text:
        if ch == "`":
            current_run += 1
            if current_run > longest_run:
                longest_run = current_run
        else:
            current_run = 0
    fence_len = max(3, longest_run + 1)
    return "`" * fence_len


def _blockquote_prefix(text: str) -> str:
    """Prefix every line of ``text`` with ``> `` so the whole block renders as a quote.

    Empty source lines become ``>`` (no trailing space) so blockquote continuity
    is preserved without trailing-whitespace noise.
    """
    if not isinstance(text, str):
        return ""
    out_lines: list[str] = []
    for line in text.split("\n"):
        if line == "":
            out_lines.append(">")
        else:
            out_lines.append(f"> {line}")
    return "\n".join(out_lines)