From 473d02f71af3c996e2d6f9daa067b1a9969818e5 Mon Sep 17 00:00:00 2001
From: JesseMarkowitz
Date: Mon, 4 May 2026 23:17:18 -0400
Subject: [PATCH] =?UTF-8?q?feat:=20v0.4.0=20=E2=80=94=20rich=20content=20s?=
 =?UTF-8?q?upport=20with=20typed=20blocks=20and=20loss=20visibility?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extracts per-message content into a typed `blocks` list (text, code,
thinking, tool_use, tool_result, image_placeholder, file_placeholder,
unknown) and renders them at exporter write time. Voice transcripts,
Custom Instructions, and image references now appear in exports instead
of being silently dropped.

Foundation:
- src/blocks.py: pure block constructors, _safe_fence (fence-corruption
  defense, verified live in Joplin), _blockquote_prefix, render
- src/loss_report.py: per-run tally surfaced as INFO summary at end of
  export so silently-dropped data becomes visible

Providers:
- ChatGPT: dispatch on content_type produces typed blocks; voice shapes
  (audio_transcription, audio_asset_pointer,
  real_time_user_audio_video_asset_pointer) locked from live DevTools
  capture; Custom Instructions bug fix (parts-vs-direct-fields); role
  filter lifted; hidden-context marker driven by
  is_visually_hidden_from_conversation flag
- Claude: defensive dispatch for text/thinking/tool_use/tool_result/image
  with recursive nested-block flattening; untested against real
  rich-content data — fix-forward in v0.4.1

Exporter:
- Markdown renders from blocks at write time via
  render_blocks_to_markdown; backward-compat fallback to content for any
  pre-v0.4.0 cached data

Tests:
- 27 new tests across providers, exporters, CLI; fixtures rebuilt with
  real-shape ChatGPT voice + Custom Instructions cases
- 181/181 pass

Behavior changes (intentional):
- JSON output omits content; consumers should read blocks
- Per-conversation message counts increase (Custom Instructions,
  image-only, tool-only messages now appear)
- Existing exports not auto-re-rendered; users wanting fresh output run
  cache --clear then export

Co-Authored-By: Claude Opus 4.7
---
 CHANGELOG.md                             |  35 ++
 FUTURE.md                                |  50 ++-
 README.md                                |   4 +-
 pyproject.toml                           |   2 +-
 src/blocks.py                            | 322 +++++++++++++++
 src/exporters/markdown.py                |  14 +-
 src/loss_report.py                       |  85 ++++
 src/main.py                              |  10 +-
 src/providers/base.py                    |  11 +-
 src/providers/chatgpt.py                 | 500 +++++++++++++++++++----
 src/providers/claude.py                  | 194 ++++++---
 tests/fixtures/chatgpt_conversation.json |  90 +++-
 tests/fixtures/claude_conversation.json  |   9 +
 tests/test_cli.py                        |  47 +++
 tests/test_exporters.py                  | 256 +++++++++++-
 tests/test_providers.py                  | 389 +++++++++++++++---
 16 files changed, 1786 insertions(+), 232 deletions(-)
 create mode 100644 src/blocks.py
 create mode 100644 src/loss_report.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bc76f7b..939fe0e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,41 @@
 All notable changes to this project will be documented here.
 Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [0.4.0] - Unreleased
+### Added
+- Rich content support: messages now carry an ordered `blocks` list (text, code, thinking, tool_use, tool_result, citation, image_placeholder, file_placeholder, unknown)
+- ChatGPT voice mode: `audio_transcription` parts render as text blocks; `audio_asset_pointer` and `real_time_user_audio_video_asset_pointer` render as `📎 File attached` placeholders with size and duration metadata
+- ChatGPT Custom Instructions: `user_editable_context` and `model_editable_context` messages now appear in exports (were silently dropped — pre-existing bug fixed); rendered with a `> ℹ️ Hidden context` marker driven by the `is_visually_hidden_from_conversation` flag
+- Image placeholders for `image_asset_pointer` parts (uploads + DALL-E) inside `multimodal_text` and at message level
+- Defensive Claude block extraction: `text`, `thinking`, `tool_use`, `tool_result` (including nested-block flattening), `image` blocks (untested against real data; will fix-forward in v0.4.1 if real shapes diverge)
+- `LossReport` summary table emitted at the end of every `export` run, breaking down `unknown blocks` and `extraction failures` by raw type so silently-dropped data becomes visible
+- `_safe_fence` helper picks a fence longer than any backtick run in extracted content, preventing embedded triple-backticks from corrupting downstream rendering (verified live in Joplin during planning)
+- `unknown` blocks render as `> ⚠️ Unsupported content` with the raw type, observed top-level keys, and reason — so future API additions are visible rather than silent
+
+### Changed
+- ChatGPT role filter (previously dropped `tool` and `system` messages) is **lifted**: all roles now route through normal extraction; truly empty messages skip via the existing empty-content guard
+- Markdown rendering moves from provider-time to exporter-write-time. Providers produce blocks; exporters call `render_blocks_to_markdown` at write time. This unblocks future Obsidian/HTML exporters
+- `BaseProvider.normalize_conversation` signature now accepts an optional `LossReport` parameter (breaking change for any future custom subclass; FileProvider hasn't shipped yet)
+- `o1`/`o3` reasoning subparts inside `text` content_type messages remain rendered as plain text (defensive; reclassification to `thinking` block deferred until live shape is captured)
+
+### Fixed
+- `user_editable_context` / `model_editable_context` extraction (parts-vs-direct-fields mismatch) — Custom Instructions are no longer silently dropped from every conversation
+
+### Migration
+- Existing exports are not re-rendered automatically. To pick up v0.4.0 rendering for previously exported conversations:
+  ```
+  python -m src.main cache --clear
+  python -m src.main export --provider all
+  ```
+- JSON exports: messages now contain `blocks` (typed structured content) and may omit the legacy `content` field. External consumers reading JSON should prefer `blocks`.
+- Per-conversation message counts may increase: previously-dropped Custom Instructions, image-only user turns, and tool-only assistant turns now appear.
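+- A minimal consumer-side sketch for reading the new JSON shape (hypothetical
+  reader script: the `message_text` helper and the `export.json` filename are
+  illustrative, not part of this project; the block field names match the
+  constructors in `src/blocks.py`):
+  ```python
+  import json
+
+  def message_text(msg: dict) -> str:
+      """Prefer typed blocks (v0.4.0+); fall back to legacy content."""
+      blocks = msg.get("blocks") or []
+      if not blocks:
+          return msg.get("content", "") or ""
+      chunks = []
+      for block in blocks:
+          if block.get("type") in ("text", "thinking"):
+              chunks.append(block.get("text", ""))
+          elif block.get("type") == "code":
+              chunks.append(block.get("code", ""))
+      return "\n\n".join(c for c in chunks if c)
+
+  with open("export.json") as fh:
+      conversation = json.load(fh)
+  for msg in conversation.get("messages", []):
+      print(message_text(msg))
+  ```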
+
+### Out of scope (deferred to v0.5.0+)
+- Binary downloads of images and audio assets (placeholders show metadata only; `content not preserved in this export`)
+- Joplin resource upload for embedded media
+- Filename resolution for `file-XYZ` / `sediment://` references
+- Speculative ChatGPT types (`tether_browsing_display`, `tether_quote`) and DALL-E assistant images — fall through to `unknown` blocks if encountered
+
 ## [0.2.0] - Unreleased
 ### Added
 - Joplin import automation: `joplin` command syncs exported Markdown files to Joplin as notes
diff --git a/FUTURE.md b/FUTURE.md
index 5161bbf..38fecdd 100644
--- a/FUTURE.md
+++ b/FUTURE.md
@@ -7,6 +7,7 @@
 of these additions straightforward.
 
 **Completed:**
 - v0.1.0 — Core export: ChatGPT + Claude, incremental sync, Markdown + JSON output
 - v0.2.0 — Joplin import automation (`joplin` command, create/update notes, notebook auto-creation)
+- v0.4.0 — Rich content support: typed message blocks (text, code, thinking, tool_use, tool_result, image_placeholder, file_placeholder, unknown); ChatGPT voice transcripts as text + audio placeholders; Custom Instructions extraction; data-loss visibility via `LossReport` summary and visible `unknown` blocks
 
 ---
 
@@ -58,26 +59,43 @@
 export command to accept a pre-downloaded export ZIP or JSON.
 
 ---
 
-## Rich Content Support (v0.4.0)
+## Binary Content Downloads (v0.5.0)
 
-Currently only text content is exported. Future versions should handle:
+v0.4.0 ships placeholders for images and audio assets but does not download
+the binary content. The placeholders include the asset reference
+(`sediment://...` or `file-service://...`), MIME type, size, and duration
+where available; the actual bytes are not preserved.
 
-### Claude
-- Artifacts (code, documents, HTML) — export as separate files, link from Markdown
-- Uploaded images — download and embed or link
-- Extended thinking/reasoning blocks — include as collapsible sections
-- Tool call results and web search citations — include as footnotes or appendices
+Next steps:
+- Download attached images alongside the Markdown export, save under a
+  `media/` sibling directory with a stable filename derived from the asset
+  reference.
+- Replace `image_placeholder` rendering with an inline `![](relative/path)`
+  reference once the file is on disk.
+- Joplin integration: upload binaries as Joplin resources via `POST /resources`,
+  rewrite the rendered Markdown to use `:/resourceId` references, and track
+  the resource ID in the cache manifest so re-syncs stay idempotent (a rough
+  sketch of the upload call follows at the end of this section).
+- DALL-E images on the assistant side: not observed in this user's data; the
+  code path exists (`source = "model_generated"`) but is untested.
 
-### ChatGPT
-- DALL-E generated images — download and embed or link
-- Code Interpreter outputs — export code and results
-- File attachments — download and reference
-- Voice transcripts — include as text
+The block-level schema is already in place — only the file-fetch + rewrite
+layer needs to be added. See the `image_placeholder` and `file_placeholder`
+block definitions in `src/blocks.py`.
 
-Implementation note: the normalized message schema already includes a
-`content_type` field placeholder. When this work begins, extend the schema
-rather than replacing it. Non-text content already logs a WARNING when
-encountered so users can see what was skipped.
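+
+A rough sketch of the resource-upload step (not shipped code: `upload_resource`
+is a hypothetical helper, and `requests` stands in for whatever HTTP client
+this lands on; the endpoint and multipart field names follow the Joplin data
+API):
+
+```python
+import json
+
+import requests
+
+
+def upload_resource(path: str, token: str, base: str = "http://localhost:41184") -> str:
+    """Upload one binary as a Joplin resource and return its resource ID."""
+    with open(path, "rb") as fh:
+        resp = requests.post(
+            f"{base}/resources",
+            params={"token": token},
+            files={
+                "data": fh,  # the file bytes
+                "props": (None, json.dumps({"title": path})),  # resource metadata
+            },
+            timeout=30,
+        )
+    resp.raise_for_status()
+    return resp.json()["id"]
+```
+
+The rendered Markdown would then reference the uploaded bytes as
+`![](:/<resource_id>)` instead of the placeholder line.
+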
+## Reclassify o1/o3 Reasoning Subparts (v0.4.1)
+
+v0.4.0 leaves dict parts inside `text` content_type messages with shape
+`{"summary": ..., "content": ...}` rendered as plain text (defensive — the
+shape was inferred from a code comment, not captured live). Once a real
+reasoning conversation is captured, reclassify these as `thinking` blocks.
+
+## Suppress Hidden Context (v0.4.x)
+
+If Custom Instructions duplication across conversations becomes a storage
+problem, add an `EXPORTER_INCLUDE_HIDDEN_CONTEXT=false` env var. The toggle
+is a single `os.getenv()` check at the start of
+`_extract_editable_context_blocks` in `src/providers/chatgpt.py` — return an
+empty list if disabled.
 
 ---
 
diff --git a/README.md b/README.md
index 42efd36..a716497 100644
--- a/README.md
+++ b/README.md
@@ -426,7 +426,7 @@
 Make sure you've added the project IDs to `CHATGPT_PROJECT_IDS` in your `.env`.
 
 The provider's internal API may have changed. Run with `--debug`, sanitize the output (remove any personal content), and check the project's GitHub Issues for known fixes.
 
 ### Non-text content warnings
-Images, code interpreter outputs, DALL-E generations, and Claude artifacts are not exported in v0.2.0. A WARNING is logged for each skipped item. See `FUTURE.md` for the roadmap.
+Since v0.4.0, rich content is preserved as typed blocks in the export. ChatGPT voice transcripts render as text and audio assets as `📎 File attached` placeholders with size and duration metadata. Images render as `🖼️ Image attached` placeholders showing the asset reference. Custom Instructions appear under a `> ℹ️ Hidden context` marker. Anything the extractor doesn't recognise renders as a visible `> ⚠️ Unsupported content` block naming the type and observed keys, *and* increments a counter in the post-export summary so you can tell whether real content is being silently skipped. Binary downloads (the actual image/audio bytes) are still deferred — see `FUTURE.md` v0.5.0.
 
 ### Empty export / all conversations skipped
 No new or updated conversations since your last run. To verify: `ai-chat-exporter cache --show`. To force a full re-export: `ai-chat-exporter cache --clear`.
@@ -444,7 +444,7 @@
 See `FUTURE.md` for planned features:
 
 - **v0.2.x** — `export --force` flag; `joplin --force` flag; per-conversation cache reset
 - **v0.3.0** — Official API fallback: parse export ZIP files from ChatGPT/Claude settings
-- **v0.4.0** — Rich content: images, artifacts, code interpreter output, extended thinking
+- **v0.4.x / v0.5.0** — Binary content downloads (images, audio bytes) and Joplin resource upload; reclassify o1/o3 reasoning subparts; optional `EXPORTER_INCLUDE_HIDDEN_CONTEXT` toggle
 - **v0.5.0** — Watch/scheduled mode; Obsidian vault output
 
 ---
diff --git a/pyproject.toml b/pyproject.toml
index dcf27a8..311fff5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "ai-chat-exporter"
-version = "0.2.1"
+version = "0.4.0"
 description = "Export ChatGPT and Claude conversation history to Markdown for personal archival in Joplin"
 requires-python = ">=3.11"
 dependencies = [
diff --git a/src/blocks.py b/src/blocks.py
new file mode 100644
index 0000000..91ac161
--- /dev/null
+++ b/src/blocks.py
@@ -0,0 +1,322 @@
+"""Typed content blocks for normalized messages.
+
+Providers produce ordered lists of blocks; exporters render them. This module
+lives outside ``src/providers/`` deliberately — blocks are a separate concern
+from extraction or rendering, shared by both layers.
+
+Block dicts always have ``type`` set to one of the BLOCK_TYPE_* constants.
+Construct via the ``make_*`` helpers; never build dicts by hand. The ``unknown``
+block constructor REQUIRES a corresponding WARNING log + ``LossReport`` tally
+at the call site — see plan §Data-loss visibility.
+"""
+
+import json
+from typing import Any
+
+BLOCK_TYPE_TEXT = "text"
+BLOCK_TYPE_CODE = "code"
+BLOCK_TYPE_THINKING = "thinking"
+BLOCK_TYPE_TOOL_USE = "tool_use"
+BLOCK_TYPE_TOOL_RESULT = "tool_result"
+BLOCK_TYPE_CITATION = "citation"
+BLOCK_TYPE_IMAGE_PLACEHOLDER = "image_placeholder"
+BLOCK_TYPE_FILE_PLACEHOLDER = "file_placeholder"
+BLOCK_TYPE_UNKNOWN = "unknown"
+BLOCK_TYPE_HIDDEN_CONTEXT_MARKER = "hidden_context_marker"
+
+UNKNOWN_REASON_UNKNOWN_TYPE = "unknown_type"
+UNKNOWN_REASON_EXTRACTION_FAILED = "extraction_failed"
+UNKNOWN_REASON_ALL_BLOCKS_FAILED = "all_blocks_failed"
+UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE = "unknown_field_in_known_type"
+
+_OBSERVED_KEYS_LIMIT = 10
+
+
+# ---------------------------------------------------------------------------
+# Constructors
+# ---------------------------------------------------------------------------
+
+
+def make_text_block(text: str) -> dict | None:
+    """Return a text block, or None if the text is empty/whitespace-only.
+
+    Returning None lets callers do ``if block: blocks.append(block)`` and prune
+    empty blocks at construction time. See plan §Finalizing the message dict.
+    """
+    if not isinstance(text, str) or not text.strip():
+        return None
+    return {"type": BLOCK_TYPE_TEXT, "text": text}
+
+
+def make_code_block(code: str, language: str = "") -> dict | None:
+    """Return a code block, or None if code is empty."""
+    if not isinstance(code, str) or not code.strip():
+        return None
+    return {"type": BLOCK_TYPE_CODE, "language": language or "", "code": code}
+
+
+def make_thinking_block(text: str) -> dict | None:
+    """Return a thinking block, or None if empty."""
+    if not isinstance(text, str) or not text.strip():
+        return None
+    return {"type": BLOCK_TYPE_THINKING, "text": text}
+
+
+def make_tool_use_block(name: str, input_data: Any, tool_id: str | None = None) -> dict:
+    """Return a tool_use block.
+
+    Always returns a block (no None) — tool calls are meaningful even with
+    empty inputs.
+    """
+    return {
+        "type": BLOCK_TYPE_TOOL_USE,
+        "name": name or "",
+        "input": input_data if input_data is not None else {},
+        "tool_id": tool_id,
+    }
+
+
+def make_tool_result_block(
+    output: str,
+    tool_name: str | None = None,
+    is_error: bool = False,
+) -> dict:
+    """Return a tool_result block."""
+    return {
+        "type": BLOCK_TYPE_TOOL_RESULT,
+        "tool_name": tool_name,
+        "output": output if isinstance(output, str) else str(output),
+        "is_error": bool(is_error),
+    }
+
+
+def make_citation_block(
+    url: str,
+    title: str | None = None,
+    snippet: str | None = None,
+) -> dict | None:
+    if not url:
+        return None
+    return {
+        "type": BLOCK_TYPE_CITATION,
+        "url": url,
+        "title": title,
+        "snippet": snippet,
+    }
+
+
+def make_image_placeholder(
+    ref: str,
+    source: str = "unknown",
+    mime: str | None = None,
+) -> dict:
+    """source ∈ {'user_upload', 'model_generated', 'unknown'}."""
+    return {
+        "type": BLOCK_TYPE_IMAGE_PLACEHOLDER,
+        "ref": ref or "",
+        "source": source,
+        "mime": mime,
+    }
+
+
+def make_file_placeholder(
+    ref: str,
+    filename: str | None = None,
+    mime: str | None = None,
+    size_bytes: int | None = None,
+    duration_seconds: float | None = None,
+) -> dict:
+    return {
+        "type": BLOCK_TYPE_FILE_PLACEHOLDER,
+        "ref": ref or "",
+        "filename": filename,
+        "mime": mime,
+        "size_bytes": size_bytes,
+        "duration_seconds": duration_seconds,
+    }
+
+
+def make_unknown_block(
+    raw_type: str,
+    observed_keys: list[str] | None = None,
+    reason: str = UNKNOWN_REASON_UNKNOWN_TYPE,
+    summary: str | None = None,
+) -> dict:
+    """Construct an unknown block.
+
+    Every call site MUST also emit a WARNING log and increment a LossReport
+    tally — see plan §Data-loss visibility. The block surfaces the loss at
+    read time; the WARNING surfaces it at run time. Both signals matter.
+    """
+    keys = list(observed_keys or [])[:_OBSERVED_KEYS_LIMIT]
+    return {
+        "type": BLOCK_TYPE_UNKNOWN,
+        "raw_type": raw_type,
+        "observed_keys": keys,
+        "reason": reason,
+        "summary": summary,
+    }
+
+
+def make_hidden_context_marker(content_type: str) -> dict:
+    """A short prepend block that flags the surrounding message as hidden context.
+
+    Driven by the ``metadata.is_visually_hidden_from_conversation`` flag, not by
+    content_type matching. The marker tells the reader "this message is
+    hidden in the source UI; we're showing it here for archival fidelity."
+    """
+    return {
+        "type": BLOCK_TYPE_HIDDEN_CONTEXT_MARKER,
+        "content_type": content_type or "",
+    }
+
+
+# ---------------------------------------------------------------------------
+# Rendering
+# ---------------------------------------------------------------------------
+
+
+def render_blocks_to_markdown(blocks: list[dict]) -> str:
+    """Render an ordered list of blocks to a single Markdown string.
+
+    Blocks are joined with one blank line between them. Pure function; no I/O.
+    """
+    if not blocks:
+        return ""
+
+    rendered: list[str] = []
+    for block in blocks:
+        chunk = _render_one(block)
+        if chunk:
+            rendered.append(chunk)
+
+    return "\n\n".join(rendered)
+
+
+def _render_one(block: dict) -> str:
+    btype = block.get("type", "")
+    if btype == BLOCK_TYPE_TEXT:
+        return block.get("text", "")
+    if btype == BLOCK_TYPE_CODE:
+        lang = block.get("language") or ""
+        code = block.get("code", "")
+        fence = _safe_fence(code)
+        return f"{fence}{lang}\n{code}\n{fence}"
+    if btype == BLOCK_TYPE_THINKING:
+        text = block.get("text", "")
+        quoted = _blockquote_prefix(text)
+        return f"**💭 Reasoning**\n\n{quoted}"
+    if btype == BLOCK_TYPE_TOOL_USE:
+        name = block.get("name", "")
+        input_data = block.get("input", {})
+        body_json = json.dumps(input_data, indent=2, sort_keys=False, default=str, ensure_ascii=False)
+        fence = _safe_fence(body_json)
+        body = f"{fence}json\n{body_json}\n{fence}"
+        quoted = _blockquote_prefix(f"🔧 **Tool: {name}**\n{body}")
+        return quoted
+    if btype == BLOCK_TYPE_TOOL_RESULT:
+        output = block.get("output", "")
+        is_error = bool(block.get("is_error"))
+        header = "❌ **Result (error)**" if is_error else "📤 **Result**"
+        fence = _safe_fence(output)
+        body = f"{fence}\n{output}\n{fence}"
+        return _blockquote_prefix(f"{header}\n{body}")
+    if btype == BLOCK_TYPE_CITATION:
+        url = block.get("url", "")
+        title = block.get("title") or url
+        return f"[{title}]({url})"
+    if btype == BLOCK_TYPE_IMAGE_PLACEHOLDER:
+        ref = block.get("ref", "")
+        source = block.get("source", "unknown")
+        mime = block.get("mime")
+        meta_parts = [source] if source else []
+        if mime:
+            meta_parts.append(mime)
+        meta_parts.append("content not preserved in this export")
+        meta = ", ".join(meta_parts)
+        return f"> 🖼️ **Image attached** — `{ref}` ({meta})"
+    if btype == BLOCK_TYPE_FILE_PLACEHOLDER:
+        ref = block.get("ref", "")
+        filename = block.get("filename")
+        label = filename or ref
+        mime = block.get("mime")
+        size_bytes = block.get("size_bytes")
+        duration = block.get("duration_seconds")
+        meta_parts: list[str] = []
+        if mime:
+            meta_parts.append(mime)
+        if isinstance(size_bytes, int) and size_bytes > 0:
+            kb = size_bytes / 1024
+            meta_parts.append(f"{kb:.1f} KB" if kb < 1024 else f"{kb / 1024:.2f} MB")
+        if isinstance(duration, (int, float)) and duration > 0:
+            meta_parts.append(f"{duration:.2f}s")
+        meta_parts.append("content not preserved in this export")
+        meta = ", ".join(meta_parts)
+        return f"> 📎 **File attached** — `{label}` ({meta})"
+    if btype == BLOCK_TYPE_UNKNOWN:
+        raw_type = block.get("raw_type", "?")
+        reason = block.get("reason", UNKNOWN_REASON_UNKNOWN_TYPE)
+        keys = block.get("observed_keys") or []
+        summary = block.get("summary")
+        first_line = f"⚠️ **Unsupported content** — type `{raw_type}` ({reason})"
+        lines = [first_line]
+        if summary:
+            lines.append(summary)
+        if keys:
+            keys_str = ", ".join(f"`{k}`" for k in keys)
+            lines.append(f"Keys observed: {keys_str}")
+        return _blockquote_prefix("\n".join(lines))
+    if btype == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER:
+        ctype = block.get("content_type", "")
+        return f"> ℹ️ **Hidden context** — `{ctype}`"
+
+    # Defensive: a block of unrecognised local type (shouldn't happen if
+    # constructors are used). Render as visible warning rather than dropping.
+    return f"> ⚠️ **Internal: unrecognised block type** — `{btype}`"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _safe_fence(text: str) -> str:
+    """Return a backtick fence longer than the longest run of backticks in ``text``.
+
+    CommonMark requires the closing fence to be at least as long as the opening
+    fence. Picking N+1 (where N = longest run in content) ensures the content's
+    own backticks are inert. Minimum is 3.
+
+    Verified live against Joplin during planning — see plan
+    §Backtick-corruption defense.
+    """
+    if not isinstance(text, str):
+        return "```"
+    longest_run = 0
+    current_run = 0
+    for ch in text:
+        if ch == "`":
+            current_run += 1
+            if current_run > longest_run:
+                longest_run = current_run
+        else:
+            current_run = 0
+    fence_len = max(3, longest_run + 1)
+    return "`" * fence_len
+
+
+def _blockquote_prefix(text: str) -> str:
+    """Prefix every line of ``text`` with ``> `` so the whole block renders as a quote.
+
+    Empty source lines become ``>`` (no trailing space) so blockquote continuity
+    is preserved without trailing-whitespace noise.
+    """
+    if not isinstance(text, str):
+        return ""
+    out_lines: list[str] = []
+    for line in text.split("\n"):
+        if line == "":
+            out_lines.append(">")
+        else:
+            out_lines.append(f"> {line}")
+    return "\n".join(out_lines)
diff --git a/src/exporters/markdown.py b/src/exporters/markdown.py
index 3c37fac..de33750 100644
--- a/src/exporters/markdown.py
+++ b/src/exporters/markdown.py
@@ -6,6 +6,7 @@
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 
+from src.blocks import render_blocks_to_markdown
 from src.utils import build_export_path, generate_filename
 
 logger = logging.getLogger(__name__)
@@ -125,10 +126,17 @@
         # Messages
         for msg in messages:
             role = msg.get("role", "user")
-            content = msg.get("content", "")
+            blocks = msg.get("blocks") or []
            timestamp = msg.get("timestamp")
 
-            if not content or not content.strip():
+            # Prefer rendering from blocks (v0.4.0+). Backward-compat fallback:
+            # if blocks is missing/empty AND content exists, render content as-is.
+            if blocks:
+                body = render_blocks_to_markdown(blocks)
+            else:
+                body = msg.get("content", "") or ""
+
+            if not body or not body.strip():
                 logger.warning(
                     "[markdown] Skipping empty/whitespace message in conversation %s",
                     conv_id[:8],
@@ -143,7 +151,7 @@
             else:
                 lines.append("")
 
-            lines.append(content)
+            lines.append(body)
             lines.append("")
             lines.append("---")
             lines.append("")
diff --git a/src/loss_report.py b/src/loss_report.py
new file mode 100644
index 0000000..389666a
--- /dev/null
+++ b/src/loss_report.py
@@ -0,0 +1,85 @@
+"""Per-export-run tally for content that was dropped or partially extracted.
+
+Surfaces the loss visibility that the rest of the system promises in its
+output (visible ``unknown`` blocks). The summary emitted at the end of
+each export is the load-bearing operator-facing signal: if a real content
+type starts being silently dropped, this is where it shows up.
+
+Pass a single instance through ``BaseProvider.normalize_conversation`` and
+read it back in ``src/main.py`` after the export loop. No global state.
+"""
+
+from collections import Counter
+from dataclasses import dataclass, field
+
+_TOP_N_BREAKDOWN = 5
+
+
+@dataclass
+class LossReport:
+    """Counters for things that didn't render cleanly in an export run."""
+
+    # Type-keyed counters. Values are int counts.
+    unknown_blocks: Counter = field(default_factory=Counter)
+    extraction_failures: Counter = field(default_factory=Counter)
+    filtered_roles: Counter = field(default_factory=Counter)
+
+    # Aggregate counters
+    messages_rendered: int = 0
+    conversations: int = 0
+
+    # Recording -------------------------------------------------------------
+
+    def record_unknown(self, raw_type: str) -> None:
+        self.unknown_blocks[raw_type or "?"] += 1
+
+    def record_extraction_failure(self, raw_type: str) -> None:
+        self.extraction_failures[raw_type or "?"] += 1
+
+    def record_filtered_role(self, role: str) -> None:
+        self.filtered_roles[role or "?"] += 1
+
+    def record_message(self) -> None:
+        self.messages_rendered += 1
+
+    def record_conversation(self) -> None:
+        self.conversations += 1
+
+    # Summary ---------------------------------------------------------------
+
+    def format_summary(self) -> str:
+        """Return a multi-line summary table suitable for INFO logging.
+
+        Format pinned by plan §Post-export summary — "(none)" sentinel when a
+        counter is empty, top-5 breakdown with "+ N more types" overflow.
+        """
+        lines: list[str] = ["[export] Run summary:"]
+        lines.append(f"  conversations: {self.conversations}")
+        lines.append(f"  messages rendered: {self.messages_rendered}")
+        lines.extend(_format_section("unknown blocks:", self.unknown_blocks))
+        lines.extend(_format_section("extraction failures:", self.extraction_failures))
+        lines.append(
+            "  filtered roles: "
+            "(filter lifted in v0.4.0 — counter retained for future use, expected 0)"
+        )
+        if self.filtered_roles:
+            for role, count in self.filtered_roles.most_common(_TOP_N_BREAKDOWN):
+                lines.append(f"    {role}={count}")
+        return "\n".join(lines)
+
+
+def _format_section(label: str, counter: Counter) -> list[str]:
+    """Render one counter section: header line + indented breakdown lines."""
+    total = sum(counter.values())
+    header = f"  {label} {total}"
+    if total == 0:
+        return [header, "    (none)"]
+
+    lines = [header]
+    most_common = counter.most_common()
+    for raw_type, count in most_common[:_TOP_N_BREAKDOWN]:
+        lines.append(f"    {raw_type}={count}")
+    if len(most_common) > _TOP_N_BREAKDOWN:
+        remainder = len(most_common) - _TOP_N_BREAKDOWN
+        lines.append(f"    + {remainder} more types")
+    return lines
diff --git a/src/main.py b/src/main.py
index 0dd522e..dbc7975 100644
--- a/src/main.py
+++ b/src/main.py
@@ -16,6 +16,7 @@
 from rich.table import Table
 
 from src.cache import Cache, CacheError
 from src.config import ConfigError
 from src.logging_config import setup_logging
+from src.loss_report import LossReport
 from src.providers.base import ProviderError
 
 console = Console()
@@ -554,6 +555,9 @@
     # Summary counters
     summary: dict[str, dict[str, int]] = {}
 
+    # Single LossReport tracks data-loss visibility across all providers in this run.
+    loss_report = LossReport()
+
     for prov_name, prov_instance in providers_to_run:
         summary[prov_name] = {"exported": 0, "skipped": 0, "failed": 0}
@@ -611,7 +615,7 @@
                 for key, val in raw_conv.items():
                     if (key.startswith("_") or key in _PROPAGATE_KEYS) and key not in full_raw:
                         full_raw[key] = val
-                normalized = prov_instance.normalize_conversation(full_raw)
+                normalized = prov_instance.normalize_conversation(full_raw, loss_report)
 
                 exported_path: Path | None = None
                 if md_exporter:
@@ -642,6 +646,10 @@
     if not dry_run:
         _print_export_summary(summary)
 
+    # Emit the data-loss summary at INFO level so it lands in the log file
+    # AND the operator's console (default level is INFO).
+    for line in loss_report.format_summary().split("\n"):
+        logger.info(line)
 
 
 def _resolve_providers(provider: str, cfg) -> list[tuple[str, object]]:
diff --git a/src/providers/base.py b/src/providers/base.py
index db17e18..b229fee 100644
--- a/src/providers/base.py
+++ b/src/providers/base.py
@@ -9,6 +9,7 @@
 from typing import Any
 
 import requests
 
+from src.loss_report import LossReport
 from src.utils import redact_secrets
 
 # curl_cffi has its own exception hierarchy (rooted at CurlError → OSError),
@@ -89,8 +90,14 @@
         """Return the full conversation detail for a single ID."""
 
     @abstractmethod
-    def normalize_conversation(self, raw: dict) -> dict:
-        """Transform provider-specific schema to the common normalized schema."""
+    def normalize_conversation(self, raw: dict, loss_report: LossReport | None = None) -> dict:
+        """Transform provider-specific schema to the common normalized schema.
+
+        ``loss_report`` accumulates counts of dropped/unhandled content so the
+        export loop can surface a single summary at the end. When None, providers
+        construct a throwaway local report (so calling normalize_conversation in
+        isolation, e.g. from tests, doesn't crash).
+        """
 
     # ------------------------------------------------------------------
    # Concrete helpers
diff --git a/src/providers/chatgpt.py b/src/providers/chatgpt.py
index 6a6ca12..c525885 100644
--- a/src/providers/chatgpt.py
+++ b/src/providers/chatgpt.py
@@ -25,6 +25,19 @@ from typing import Any
 
 from curl_cffi import requests as curl_requests
 
+from src.blocks import (
+    UNKNOWN_REASON_EXTRACTION_FAILED,
+    UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE,
+    UNKNOWN_REASON_UNKNOWN_TYPE,
+    make_code_block,
+    make_file_placeholder,
+    make_hidden_context_marker,
+    make_image_placeholder,
+    make_text_block,
+    make_thinking_block,
+    make_unknown_block,
+)
+from src.loss_report import LossReport
 from src.providers.base import BaseProvider, ProviderError, REQUEST_TIMEOUT
 
 logger = logging.getLogger(__name__)
@@ -551,7 +564,7 @@
     # Normalization
     # ------------------------------------------------------------------
 
-    def normalize_conversation(self, raw: dict) -> dict:
+    def normalize_conversation(self, raw: dict, loss_report: LossReport | None = None) -> dict:
         """Transform ChatGPT raw schema to the common normalized schema.
 
         ChatGPT stores messages in a nested ``mapping`` dict where each node
@@ -562,6 +575,7 @@
         fetch_all_conversations). The conversation detail endpoint does not
         include project information.
         """
+        report = loss_report if loss_report is not None else LossReport()
         conv_id = raw.get("id", "")
         title = raw.get("title") or "Untitled"
         created_at = _ts_to_iso(raw.get("create_time"))
@@ -580,7 +594,10 @@
         )
 
         mapping: dict = raw.get("mapping", {})
-        messages = _extract_messages(mapping, raw, conv_id)
+        messages = _extract_messages(mapping, raw, conv_id, report)
+        for _ in messages:
+            report.record_message()
+        report.record_conversation()
 
         return {
             "id": conv_id,
@@ -610,14 +627,18 @@ def _ts_to_iso(ts: float | int | str | None) -> str:
 
 
 def _extract_messages(
-    mapping: dict[str, Any], raw: dict, conv_id: str
+    mapping: dict[str, Any], raw: dict, conv_id: str, report: LossReport
 ) -> list[dict]:
-    """Walk the ChatGPT conversation mapping tree to produce an ordered message list."""
+    """Walk the ChatGPT conversation mapping tree to produce an ordered message list.
+
+    All roles (user/assistant/system/tool) are processed; the prior filter that
+    dropped non-user/assistant messages is lifted in v0.4.0 — truly empty
+    messages skip via the empty-content guard, anything with content renders.
+    """
     if not mapping:
         logger.warning("[chatgpt] Conversation %s has empty mapping", conv_id[:8])
         return []
 
-    # Find the root node (the one that has no parent, or whose parent is None/not in mapping)
     root_id = _find_root(mapping)
     if root_id is None:
         logger.warning(
@@ -635,68 +656,12 @@
         node = mapping.get(node_id, {})
         msg_data = node.get("message")
 
-        if msg_data:
-            role = msg_data.get("author", {}).get("role", "")
-            # Skip system/tool messages silently unless they have visible content
-            if role in ("user", "assistant"):
-                content_obj = msg_data.get("content", {})
-                content_type = content_obj.get("content_type", "text")
-                ts = msg_data.get("create_time")
+        # Root and system nodes may carry "message": null; skip them before building.
+        built = _build_message(msg_data, conv_id, node_id, report) if msg_data else None
+        if built is not None:
+            messages.append(built)
 
-                # Content types whose parts[] contain plain text strings.
-                # model_editable_context / user_editable_context = project instructions
-                # thoughts / reasoning_recap = o1/o3 reasoning traces
-                _TEXT_PARTS_TYPES = {
-                    "text",
-                    "model_editable_context",
-                    "user_editable_context",
-                    "thoughts",
-                    "reasoning_recap",
-                }
-
-                if content_type in _TEXT_PARTS_TYPES:
-                    text = _extract_text(content_obj, conv_id, node_id)
-                    if text:
-                        messages.append(
-                            {
-                                "role": role,
-                                "content": text,
-                                "content_type": "text",
-                                "timestamp": _ts_to_iso(ts) if ts else None,
-                            }
-                        )
-                    else:
-                        logger.debug(
-                            "[chatgpt] Skipping empty %s message in conversation %s",
-                            content_type,
-                            conv_id[:8],
-                        )
-                elif content_type == "code":
-                    # Inline code response — extract and wrap in a fenced code block
-                    code_text = content_obj.get("text") or "\n".join(
-                        p for p in content_obj.get("parts", []) if isinstance(p, str)
-                    )
-                    language = content_obj.get("language", "")
-                    if code_text:
-                        messages.append(
-                            {
-                                "role": role,
-                                "content": f"```{language}\n{code_text}\n```",
-                                "content_type": "code",
-                                "timestamp": _ts_to_iso(ts) if ts else None,
-                            }
-                        )
-                else:
-                    logger.warning(
-                        "[chatgpt] Skipping %s content in conversation %s message %s "
-                        "— rich content not yet supported (see FUTURE.md)",
-                        content_type,
-                        conv_id[:8],
-                        node_id[:8],
-                    )
-
-        # Walk children in order (ChatGPT typically has one child per node in a linear chat)
+        # Walk children in order (linear in typical conversations)
         for child_id in node.get("children", []):
             walk(child_id)
@@ -718,36 +683,405 @@ def _find_root(mapping: dict[str, Any]) -> str | None:
     return None
 
 
-def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
-    """Extract plain text from a ChatGPT content object.
+def _build_message(
+    msg_data: dict, conv_id: str, node_id: str, report: LossReport
+) -> dict | None:
+    """Construct a normalized message dict (with ``blocks``) for one ChatGPT node.
 
-    Handles three part shapes:
-    - str — plain text (most messages)
-    - dict with content_type="text" — wrapped text part
-    - dict with "content" key — o1/o3 thoughts/reasoning parts
+    Returns None for messages that should be skipped (truly empty). Otherwise
+    returns a dict with ``role``, ``content_type``, ``timestamp``, ``blocks``.
    """
+    author = msg_data.get("author") or {}
+    role = author.get("role", "") or ""
+    if role not in ("user", "assistant", "system", "tool"):
+        # Unrecognised role — log and surface, but pass through so role metadata
+        # is preserved for the reader.
+        logger.debug(
+            "[chatgpt] Unrecognised role %r in conversation %s message %s",
+            role,
+            conv_id[:8],
+            node_id[:8],
+        )
+
+    content_obj = msg_data.get("content") or {}
+    content_type = content_obj.get("content_type", "text")
+    ts = msg_data.get("create_time")
+    metadata = msg_data.get("metadata") or {}
+    is_hidden = bool(metadata.get("is_visually_hidden_from_conversation"))
+
+    blocks = _extract_blocks_for_content(
+        content_type, content_obj, role, conv_id, node_id, report
+    )
+
+    if not blocks:
+        logger.debug(
+            "[chatgpt] Skipping empty %s message in conversation %s",
+            content_type,
+            conv_id[:8],
+        )
+        return None
+
+    if is_hidden:
+        # Prepend a marker so the reader knows this message is hidden in the
+        # source UI. The marker is content-type-agnostic.
+        blocks = [make_hidden_context_marker(content_type)] + blocks
+
+    # Vestigial content_type: "code" for code-only messages, otherwise "text"
+    msg_content_type = "code" if (
+        len(blocks) == 1 and blocks[0].get("type") == "code"
+    ) else "text"
+
+    return {
+        "role": role or "user",
+        "content_type": msg_content_type,
+        "timestamp": _ts_to_iso(ts) if ts else None,
+        "blocks": blocks,
+    }
+
+
+# Content types whose ``parts`` are plain text strings.
+_PLAIN_TEXT_PARTS_TYPES = {"text"}
+# Content types that carry inline reasoning/thoughts.
+_THINKING_TYPES = {"thoughts", "reasoning_recap"}
+# Custom-Instructions / model-context types — direct fields, NOT parts.
+_DIRECT_FIELD_CONTEXT_TYPES = {
+    "user_editable_context",
+    "model_editable_context",
+}
+# Known direct fields per context type. Anything not listed but non-null
+# becomes an `unknown` block per the no-silent-drop-of-non-null-fields rule.
+_USER_EDITABLE_CONTEXT_KNOWN_FIELDS = ("user_profile", "user_instructions")
+_MODEL_EDITABLE_CONTEXT_KNOWN_FIELDS = (
+    "model_set_context",
+    "repository",
+    "repo_summary",
+    "structured_context",
+)
+
+
+def _extract_blocks_for_content(
+    content_type: str,
+    content_obj: dict,
+    role: str,
+    conv_id: str,
+    node_id: str,
+    report: LossReport,
+) -> list[dict]:
+    """Dispatch on content_type and return a list of blocks for one message."""
+
+    if content_type in _PLAIN_TEXT_PARTS_TYPES:
+        return _extract_text_content_type_blocks(content_obj, conv_id, node_id, report)
+
+    if content_type == "multimodal_text":
+        return _extract_multimodal_blocks(content_obj, role, conv_id, node_id, report)
+
+    if content_type == "code":
+        code_text = content_obj.get("text") or "\n".join(
+            p for p in content_obj.get("parts", []) if isinstance(p, str)
+        )
+        language = content_obj.get("language", "") or ""
+        block = make_code_block(code_text, language)
+        return [block] if block else []
+
+    if content_type in _THINKING_TYPES:
+        text = _join_string_parts(content_obj)
+        block = make_thinking_block(text)
+        return [block] if block else []
+
+    if content_type in _DIRECT_FIELD_CONTEXT_TYPES:
+        return _extract_editable_context_blocks(
+            content_type, content_obj, conv_id, node_id, report
+        )
+
+    if content_type == "image_asset_pointer":
+        # Top-level image (rare — usually nested inside multimodal_text).
+        ref = content_obj.get("asset_pointer", "")
+        source = "user_upload" if role == "user" else "model_generated"
+        return [make_image_placeholder(ref=ref, source=source)]
+
+    # Unknown content_type → visible unknown block + WARNING + tally
+    keys = list(content_obj.keys())
+    logger.warning(
+        "[chatgpt] Unknown content_type %r in conversation %s message %s "
+        "— see plan §Data-loss visibility (rendering as unknown block)",
+        content_type,
+        conv_id[:8],
+        node_id[:8],
+    )
+    report.record_unknown(content_type or "?")
+    return [
+        make_unknown_block(
+            raw_type=content_type or "?",
+            observed_keys=keys,
+            reason=UNKNOWN_REASON_UNKNOWN_TYPE,
+        )
+    ]
+
+
+def _extract_text_content_type_blocks(
+    content_obj: dict, conv_id: str, node_id: str, report: LossReport
+) -> list[dict]:
+    """Extract blocks for ``content_type == "text"``.
+
+    Plural-parts rule: emit ONE text block per message with all string parts
+    joined by ``\\n``. Don't emit one block per part.
+
+    Dict parts inside a text content_type message (the suspected o1/o3 reasoning
+    subpart shape ``{"summary": ..., "content": ...}``) are preserved as text
+    today — defensive behavior pending real-data capture in v0.4.1.
+    """
+    parts = content_obj.get("parts", []) or []
+    string_chunks: list[str] = []
+
+    for part in parts:
+        if isinstance(part, str):
+            string_chunks.append(part)
+        elif isinstance(part, dict):
+            part_type = part.get("content_type", "")
+            if part_type == "text":
+                txt = part.get("text", "") or ""
+                if txt:
+                    string_chunks.append(txt)
+            elif "content" in part:
+                # Suspected o1/o3 reasoning subpart. Defensive: preserve as text
+                # block (matches current behavior). v0.4.1 reclassifies once
+                # the real shape is captured live.
+                content_val = part.get("content", "") or ""
+                if content_val:
+                    string_chunks.append(content_val)
+            elif part_type:
+                # Non-text dict part inside a text content_type — surface it.
+                logger.warning(
+                    "[chatgpt] Unexpected %s part inside text content_type "
+                    "in conversation %s message %s — rendering as unknown block",
+                    part_type,
+                    conv_id[:8],
+                    node_id[:8],
+                )
+                report.record_unknown(part_type)
+                # Inline mark in the joined text so order is preserved.
+                string_chunks.append(
+                    f"\n[Unknown part: type={part_type}; "
+                    f"keys={list(part.keys())[:10]}]\n"
+                )
+
+    joined = "\n".join(c for c in string_chunks if c)
+    block = make_text_block(joined)
+    return [block] if block else []
+
+
+def _join_string_parts(content_obj: dict) -> str:
+    """Helper: join all string parts in ``parts`` with newlines."""
+    parts = content_obj.get("parts", []) or []
+    return "\n".join(p for p in parts if isinstance(p, str) and p)
+
+
+def _extract_multimodal_blocks(
+    content_obj: dict, role: str, conv_id: str, node_id: str, report: LossReport
+) -> list[dict]:
+    """Extract blocks from a ``multimodal_text`` content object.
+
+    Walks ``parts`` in array order — order varies between user and assistant
+    turns, and the extractor preserves source ordering. Emits text +
+    image_placeholder + file_placeholder blocks per part.
+    """
+    parts = content_obj.get("parts", []) or []
+    blocks: list[dict] = []
+
+    for part in parts:
+        if isinstance(part, str):
+            block = make_text_block(part)
+            if block:
+                blocks.append(block)
+            continue
+
+        if not isinstance(part, dict):
+            continue
+
+        part_type = part.get("content_type", "")
+
+        if part_type == "audio_transcription":
+            txt = part.get("text", "") or ""
+            block = make_text_block(txt)
+            if block:
+                blocks.append(block)
+            elif "text" not in part:
+                logger.warning(
+                    "[chatgpt] audio_transcription part missing 'text' key "
+                    "in conversation %s message %s",
+                    conv_id[:8],
+                    node_id[:8],
+                )
+                report.record_extraction_failure("audio_transcription")
+                blocks.append(
+                    make_unknown_block(
+                        raw_type="audio_transcription",
+                        observed_keys=list(part.keys()),
+                        reason=UNKNOWN_REASON_EXTRACTION_FAILED,
+                        summary="expected key 'text' not found",
+                    )
+                )
+            continue
+
+        if part_type == "image_asset_pointer":
+            ref = part.get("asset_pointer", "")
+            source = "user_upload" if role == "user" else "model_generated"
+            mime = None
+            blocks.append(make_image_placeholder(ref=ref, source=source, mime=mime))
+            continue
+
+        if part_type == "audio_asset_pointer":
+            blocks.append(_audio_asset_placeholder(part))
+            continue
+
+        if part_type == "real_time_user_audio_video_asset_pointer":
+            # Wrapper carrying a nested audio_asset_pointer + optional video frames.
+            nested_audio = part.get("audio_asset_pointer")
+            if isinstance(nested_audio, dict):
+                blocks.append(_audio_asset_placeholder(nested_audio))
+            else:
+                logger.warning(
+                    "[chatgpt] real_time_user_audio_video_asset_pointer missing "
+                    "nested audio_asset_pointer in conversation %s message %s",
+                    conv_id[:8],
+                    node_id[:8],
+                )
+                report.record_extraction_failure(
+                    "real_time_user_audio_video_asset_pointer"
+                )
+                blocks.append(
+                    make_unknown_block(
+                        raw_type="real_time_user_audio_video_asset_pointer",
+                        observed_keys=list(part.keys()),
+                        reason=UNKNOWN_REASON_EXTRACTION_FAILED,
+                        summary="expected nested 'audio_asset_pointer' not found",
+                    )
+                )
+
+            frames = part.get("frames_asset_pointers") or []
+            if frames:
+                # Defensive: empty in all observed cases, but if non-empty
+                # surface as a separate file placeholder.
+                video_ref = part.get("video_container_asset_pointer") or "(video frames)"
+                blocks.append(
+                    make_file_placeholder(
+                        ref=str(video_ref),
+                        mime="video/unknown",
+                    )
+                )
+            continue
+
+        # Anything else inside multimodal_text — visible unknown block
+        logger.warning(
+            "[chatgpt] Unknown multimodal_text part type %r in conversation %s message %s",
+            part_type,
+            conv_id[:8],
+            node_id[:8],
+        )
+        report.record_unknown(part_type or "?")
+        blocks.append(
+            make_unknown_block(
+                raw_type=part_type or "?",
+                observed_keys=list(part.keys()),
+                reason=UNKNOWN_REASON_UNKNOWN_TYPE,
+            )
+        )
+
+    return blocks
+
+
+def _audio_asset_placeholder(audio_part: dict) -> dict:
+    """Build a file_placeholder for an audio_asset_pointer dict.
+
+    Handles missing/zero metadata defensively.
+    """
+    ref = audio_part.get("asset_pointer", "") or ""
+    fmt = audio_part.get("format") or "unknown"
+    size_bytes = audio_part.get("size_bytes")
+    if not isinstance(size_bytes, int) or size_bytes <= 0:
+        size_bytes = None
+    metadata = audio_part.get("metadata") or {}
+    start = metadata.get("start") if isinstance(metadata, dict) else None
+    end = metadata.get("end") if isinstance(metadata, dict) else None
+    duration: float | None = None
+    if isinstance(start, (int, float)) and isinstance(end, (int, float)):
+        diff = float(end) - float(start)
+        if diff > 0:
+            duration = diff
+    return make_file_placeholder(
+        ref=ref,
+        mime=f"audio/{fmt}" if fmt else "audio/unknown",
+        size_bytes=size_bytes,
+        duration_seconds=duration,
+    )
+
+
+def _extract_editable_context_blocks(
+    content_type: str, content_obj: dict, conv_id: str, node_id: str, report: LossReport
+) -> list[dict]:
+    """Extract blocks from user_editable_context / model_editable_context messages.
+
+    These have no ``parts`` field — they carry direct keys. Read all known
+    fields, emit one labeled fenced block per non-null known field, and emit an
+    ``unknown`` block for any unrecognised non-null direct field (no-silent-drop
+    rule).
+    """
+    import json as _json
+
+    if content_type == "user_editable_context":
+        known_fields: tuple[str, ...] = _USER_EDITABLE_CONTEXT_KNOWN_FIELDS
+    elif content_type == "model_editable_context":
+        known_fields = _MODEL_EDITABLE_CONTEXT_KNOWN_FIELDS
+    else:
+        known_fields = ()
+
+    blocks: list[dict] = []
+    label_kind = "Custom Instructions" if content_type == "user_editable_context" else "Model Context"
+
+    for field in known_fields:
+        value = content_obj.get(field)
+        if value is None or (isinstance(value, str) and not value.strip()):
+            continue
+        if isinstance(value, (dict, list)):
+            # Structured values: serialize to JSON before fencing.
+            rendered = _json.dumps(value, indent=2, default=str, ensure_ascii=False)
+        else:
+            rendered = str(value)
+        label = f"**{label_kind} — {field}:**"
+        # Labeled fenced block pattern: the label goes out as a plain text block
+        # and the raw value goes out as a code block, because the code renderer
+        # calls _safe_fence internally — so backticks embedded in the
+        # instructions can't corrupt the downstream Markdown.
+        blocks.append(make_text_block(label))
+        code_block = make_code_block(rendered, language="")
+        if code_block:
+            blocks.append(code_block)
+
+    # Catch unknown non-null direct fields (no-silent-drop rule).
+    structural_keys = {"content_type", "parts"}
+    for key, value in content_obj.items():
+        if key in structural_keys or key in known_fields:
+            continue
+        if value is None:
+            continue
+        # Skip empty strings and empty containers.
+        if isinstance(value, (str, list, dict)) and not value:
+            continue
+        logger.warning(
+            "[chatgpt] Unknown non-null field %r in %s message %s/%s",
+            key,
+            content_type,
+            conv_id[:8],
+            node_id[:8],
+        )
+        report.record_unknown(f"{content_type}.{key}")
+        blocks.append(
+            make_unknown_block(
+                raw_type=f"{content_type}.{key}",
+                observed_keys=list(content_obj.keys()),
+                reason=UNKNOWN_REASON_UNKNOWN_FIELD_IN_KNOWN_TYPE,
+                summary=f"unknown non-null field '{key}' in {content_type}",
+            )
+        )
+
+    return blocks
diff --git a/src/providers/claude.py b/src/providers/claude.py
index 01fcbc4..0e9d66a 100644
--- a/src/providers/claude.py
+++ b/src/providers/claude.py
@@ -5,6 +5,17 @@
 import os
 
 from curl_cffi import requests as curl_requests
 
+from src.blocks import (
+    UNKNOWN_REASON_EXTRACTION_FAILED,
+    UNKNOWN_REASON_UNKNOWN_TYPE,
+    make_image_placeholder,
+    make_text_block,
+    make_thinking_block,
+    make_tool_result_block,
+    make_tool_use_block,
+    make_unknown_block,
+)
+from src.loss_report import LossReport
 from src.providers.base import BaseProvider, ProviderError
 
 logger = logging.getLogger(__name__)
@@ -161,8 +172,9 @@
         return data
 
-    def normalize_conversation(self, raw: dict) -> dict:
+    def normalize_conversation(self, raw: dict, loss_report: LossReport | None = None) -> dict:
         """Transform Claude raw schema to the common normalized schema."""
+        report = loss_report if loss_report is not None else LossReport()
         conv_id = raw.get("uuid") or raw.get("id", "")
         title = raw.get("name") or raw.get("title") or "Untitled"
         created_at = raw.get("created_at") or raw.get("create_time") or ""
@@ -178,40 +190,37 @@
         # Messages
         raw_messages = raw.get("chat_messages") or raw.get("messages") or []
-        messages = []
+        messages: list[dict] = []
         for msg in raw_messages:
             role = _map_role(msg.get("sender") or msg.get("role", ""))
             if not role:
                 continue
 
-            # Content can be a string or a list of content blocks
-            content_raw = msg.get("content") or msg.get("text") or ""
-            content, skipped_types = _extract_claude_text(content_raw, conv_id)
-
-            for ctype in skipped_types:
-                logger.warning(
-                    "[claude] Skipping %s content in conversation %s "
-                    "— rich content not yet supported (see FUTURE.md)",
-                    ctype,
-                    conv_id[:8],
-                )
+            content_raw = msg.get("content") if "content" in msg else msg.get("text", "")
+            blocks = _extract_claude_blocks(content_raw, conv_id, report)
 
             timestamp = msg.get("created_at") or msg.get("timestamp") or None
 
-            if content is None:
+            if not blocks:
                 logger.debug("[claude] Skipping empty message in conversation %s", conv_id[:8])
                 continue
 
+            content_type = "text"
+
             messages.append(
                 {
                     "role": role,
-                    "content": content,
-                    "content_type": "text",
+                    "content_type": content_type,
                     "timestamp": timestamp,
+                    "blocks": blocks,
                 }
             )
 
+        for _ in messages:
+            report.record_message()
+        report.record_conversation()
+
         return {
             "id": conv_id,
             "title": title,
@@ -242,43 +251,134 @@ def _map_role(sender: str) -> str | None:
     return mapping.get(sender.lower()) if sender else None
 
 
-def _extract_claude_text(
-    content: str | list | dict, conv_id: str
-) -> tuple[str | None, list[str]]:
-    """Extract plain text from a Claude content field.
+def _extract_claude_blocks(
+    content: str | list | dict | None, conv_id: str, report: LossReport
+) -> list[dict]:
+    """Extract typed blocks from a Claude content field.
 
-    Returns:
-        (text_or_None, list_of_skipped_content_types)
+    Defensive dispatch — zero observed cases of rich Claude content in the
+    user's archive at planning time, so this is theory-only. Real shapes
+    will be locked in v0.4.1 once captured. Any unrecognised block type
+    surfaces as an `unknown` block + WARNING + tally.
     """
-    skipped: list[str] = []
+    if content is None:
+        return []
 
     if isinstance(content, str):
-        text = content.strip()
-        return (text if text else None), skipped
+        block = make_text_block(content)
+        return [block] if block else []
 
     if isinstance(content, list):
-        parts: list[str] = []
-        for block in content:
-            if isinstance(block, str):
-                parts.append(block)
-            elif isinstance(block, dict):
-                btype = block.get("type", "text")
-                if btype == "text":
-                    t = block.get("text", "").strip()
-                    if t:
-                        parts.append(t)
-                else:
-                    skipped.append(btype)
-        text = "\n".join(parts).strip()
-        return (text if text else None), skipped
+        blocks: list[dict] = []
+        for item in content:
+            if isinstance(item, str):
+                block = make_text_block(item)
+                if block:
+                    blocks.append(block)
+            elif isinstance(item, dict):
+                blocks.extend(_dispatch_claude_block(item, conv_id, report))
+        return blocks
 
     if isinstance(content, dict):
-        btype = content.get("type", "text")
-        if btype == "text":
-            text = content.get("text", "").strip()
-            return (text if text else None), skipped
-        else:
-            skipped.append(btype)
-            return None, skipped
+        return _dispatch_claude_block(content, conv_id, report)
 
-    return None, skipped
+    return []
+
+
+def _dispatch_claude_block(block: dict, conv_id: str, report: LossReport) -> list[dict]:
+    """Translate one raw Claude content block into normalized blocks."""
+    btype = block.get("type", "text")
+
+    if btype == "text":
+        block_obj = make_text_block(block.get("text", "") or "")
+        return [block_obj] if block_obj else []
+
+    if btype == "thinking":
+        # Claude extended-thinking blocks may use 'thinking' or 'text' field.
+        text = block.get("thinking") or block.get("text") or ""
+        block_obj = make_thinking_block(text)
+        return [block_obj] if block_obj else []
+
+    if btype == "tool_use":
+        return [
+            make_tool_use_block(
+                name=block.get("name", "") or "",
+                input_data=block.get("input"),
+                tool_id=block.get("id"),
+            )
+        ]
+
+    if btype == "tool_result":
+        # ``content`` may be a string or a list of nested blocks (recursive).
+        nested = block.get("content")
+        output = _flatten_tool_result_content(nested, conv_id, report)
+        return [
+            make_tool_result_block(
+                output=output,
+                tool_name=None,
+                is_error=bool(block.get("is_error")),
+            )
+        ]
+
+    if btype == "image":
+        # Source shape is unverified; try the most likely fields.
+        source = block.get("source") or {}
+        ref = ""
+        if isinstance(source, dict):
+            ref = (
+                source.get("file_uuid")
+                or source.get("media_type")
+                or source.get("url")
+                or ""
+            )
+        return [make_image_placeholder(ref=ref or "(unknown)", source="user_upload")]
+
+    # Unknown block type
+    keys = list(block.keys())
+    logger.warning(
+        "[claude] Unknown block type %r in conversation %s "
+        "— see plan §Data-loss visibility (rendering as unknown block)",
+        btype,
+        conv_id[:8],
+    )
+    report.record_unknown(btype or "?")
+    return [
+        make_unknown_block(
+            raw_type=btype or "?",
+            observed_keys=keys,
+            reason=UNKNOWN_REASON_UNKNOWN_TYPE,
+        )
+    ]
+
+
+def _flatten_tool_result_content(
+    nested: object, conv_id: str, report: LossReport
+) -> str:
+    """Flatten Claude tool_result content (string OR list of nested blocks) to text.
+ + Recurses into nested text blocks; any non-text nested block becomes a + visible inline marker so non-text content isn't silently dropped. + """ + if nested is None: + return "" + if isinstance(nested, str): + return nested + if isinstance(nested, list): + chunks: list[str] = [] + for item in nested: + if isinstance(item, str): + chunks.append(item) + elif isinstance(item, dict): + btype = item.get("type", "text") + if btype == "text": + chunks.append(item.get("text", "") or "") + else: + keys = list(item.keys())[:10] + report.record_extraction_failure(f"tool_result.{btype}") + chunks.append( + f"[Unsupported nested {btype} block; keys={keys}]" + ) + return "\n".join(c for c in chunks if c) + if isinstance(nested, dict): + return _flatten_tool_result_content([nested], conv_id, report) + return str(nested) diff --git a/tests/fixtures/chatgpt_conversation.json b/tests/fixtures/chatgpt_conversation.json index e8de3d8..427bd66 100644 --- a/tests/fixtures/chatgpt_conversation.json +++ b/tests/fixtures/chatgpt_conversation.json @@ -8,12 +8,30 @@ "node-root": { "id": "node-root", "parent": null, - "children": ["node-1"], + "children": ["node-uec"], "message": null }, + "node-uec": { + "id": "node-uec", + "parent": "node-root", + "children": ["node-1"], + "message": { + "id": "node-uec", + "author": {"role": "user"}, + "create_time": null, + "content": { + "content_type": "user_editable_context", + "user_profile": "Preferred name: Jesse", + "user_instructions": "The user provided the additional info about how they would like you to respond:\n```Always cite sources.```" + }, + "metadata": { + "is_visually_hidden_from_conversation": true + } + } + }, "node-1": { "id": "node-1", - "parent": "node-root", + "parent": "node-uec", "children": ["node-2"], "message": { "id": "node-1", @@ -28,7 +46,7 @@ "node-2": { "id": "node-2", "parent": "node-1", - "children": ["node-3"], + "children": ["node-mm-user"], "message": { "id": "node-2", "author": {"role": "assistant"}, @@ -39,17 +57,71 @@ } } }, - "node-3": { - "id": "node-3", + "node-mm-user": { + "id": "node-mm-user", "parent": "node-2", - "children": [], + "children": ["node-mm-assistant"], "message": { - "id": "node-3", + "id": "node-mm-user", "author": {"role": "user"}, "create_time": 1704067300.0, "content": { - "content_type": "image_asset_pointer", - "parts": [{"content_type": "image_asset_pointer", "asset_pointer": "file://some-image"}] + "content_type": "multimodal_text", + "parts": [ + {"content_type": "audio_transcription", "text": "What is the capital of France?", "direction": "in", "decoding_id": null}, + {"content_type": "real_time_user_audio_video_asset_pointer", "frames_asset_pointers": [], "video_container_asset_pointer": null, "audio_asset_pointer": {"content_type": "audio_asset_pointer", "asset_pointer": "sediment://file_user001", "size_bytes": 50000, "format": "wav", "metadata": {"start": 0.0, "end": 2.5}}, "audio_start_timestamp": 1.0} + ] + }, + "metadata": {"voice_mode_message": true} + } + }, + "node-mm-assistant": { + "id": "node-mm-assistant", + "parent": "node-mm-user", + "children": ["node-mm-user-rev"], + "message": { + "id": "node-mm-assistant", + "author": {"role": "assistant"}, + "create_time": 1704067305.0, + "content": { + "content_type": "multimodal_text", + "parts": [ + {"content_type": "audio_transcription", "text": "The capital of France is Paris.", "direction": "out", "decoding_id": null}, + {"content_type": "audio_asset_pointer", "asset_pointer": "sediment://file_assistant001", "size_bytes": 80000, "format": "wav", 
"metadata": {"start": 0.0, "end": 3.2}} + ] + } + } + }, + "node-mm-user-rev": { + "id": "node-mm-user-rev", + "parent": "node-mm-assistant", + "children": ["node-image-only"], + "message": { + "id": "node-mm-user-rev", + "author": {"role": "user"}, + "create_time": 1704067400.0, + "content": { + "content_type": "multimodal_text", + "parts": [ + {"content_type": "real_time_user_audio_video_asset_pointer", "frames_asset_pointers": [], "video_container_asset_pointer": null, "audio_asset_pointer": {"content_type": "audio_asset_pointer", "asset_pointer": "sediment://file_user002", "size_bytes": 30000, "format": "wav", "metadata": {"start": 0.0, "end": 1.5}}, "audio_start_timestamp": 5.0}, + {"content_type": "audio_transcription", "text": "Tell me more please.", "direction": "in", "decoding_id": null} + ] + } + } + }, + "node-image-only": { + "id": "node-image-only", + "parent": "node-mm-user-rev", + "children": [], + "message": { + "id": "node-image-only", + "author": {"role": "user"}, + "create_time": 1704067500.0, + "content": { + "content_type": "multimodal_text", + "parts": [ + {"content_type": "image_asset_pointer", "asset_pointer": "file-service://image001"} + ] } } } diff --git a/tests/fixtures/claude_conversation.json b/tests/fixtures/claude_conversation.json index 291021e..720969d 100644 --- a/tests/fixtures/claude_conversation.json +++ b/tests/fixtures/claude_conversation.json @@ -30,6 +30,15 @@ "sender": "human", "created_at": "2024-06-10T14:45:00.000Z", "content": "Thank you, that helped!" + }, + { + "uuid": "msg-004", + "sender": "human", + "created_at": "2024-06-10T14:50:00.000Z", + "content": [ + {"type": "text", "text": "What about this image?"}, + {"type": "image", "source": {"file_uuid": "claude-image-uuid-1", "media_type": "image/png"}} + ] } ] } diff --git a/tests/test_cli.py b/tests/test_cli.py index 9ea4794..17065cf 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -127,3 +127,50 @@ class TestExportSinceValidation: }, ) assert "Invalid --since date" not in result.output + + +# --------------------------------------------------------------------------- +# LossReport summary +# --------------------------------------------------------------------------- + + +class TestLossReportSummary: + """The LossReport's format_summary() pinned format covers zero, top-5, and overflow cases.""" + + def test_zero_summary_uses_none_sentinel(self): + from src.loss_report import LossReport + + report = LossReport() + out = report.format_summary() + assert "[export] Run summary:" in out + assert "conversations: 0" in out + assert "messages rendered: 0" in out + # Both "(none)" sentinels present β€” never empty parens + assert out.count("(none)") == 2 + + def test_top_5_breakdown(self): + from src.loss_report import LossReport + + report = LossReport() + for raw_type in ("a", "b", "c", "d", "e", "f", "g"): + report.record_unknown(raw_type) + if raw_type == "a": + # Make 'a' the most common + for _ in range(4): + report.record_unknown("a") + out = report.format_summary() + # Top entry shown + assert "a=5" in out + # Overflow line present (7 types, top 5 + 2 more) + assert "+ 2 more types" in out + + def test_messages_and_conversations_recorded(self): + from src.loss_report import LossReport + + report = LossReport() + report.record_conversation() + report.record_message() + report.record_message() + out = report.format_summary() + assert "conversations: 1" in out + assert "messages rendered: 2" in out diff --git a/tests/test_exporters.py b/tests/test_exporters.py index 1536482..7010974 
100644 --- a/tests/test_exporters.py +++ b/tests/test_exporters.py @@ -1,4 +1,4 @@ -"""Unit tests for src/exporters/.""" +"""Unit tests for src/exporters/ and src/blocks.py.""" import json import os @@ -7,6 +7,23 @@ from pathlib import Path import pytest +from src.blocks import ( + BLOCK_TYPE_TEXT, + UNKNOWN_REASON_EXTRACTION_FAILED, + UNKNOWN_REASON_UNKNOWN_TYPE, + _blockquote_prefix, + _safe_fence, + make_code_block, + make_file_placeholder, + make_hidden_context_marker, + make_image_placeholder, + make_text_block, + make_thinking_block, + make_tool_result_block, + make_tool_use_block, + make_unknown_block, + render_blocks_to_markdown, +) from src.exporters.markdown import MarkdownExporter, _yaml_escape, _format_timestamp from src.exporters.json_export import JSONExporter @@ -250,3 +267,240 @@ class TestFormatTimestamp: def test_empty_string(self): assert _format_timestamp("") == "" + + +# --------------------------------------------------------------------------- +# Block helpers and rendering +# --------------------------------------------------------------------------- + + +class TestSafeFence: + def test_minimum_three_backticks(self): + assert _safe_fence("plain text") == "```" + + def test_four_backticks_when_three_in_content(self): + assert _safe_fence("here ``` is a fence") == "````" + + def test_five_backticks_when_four_in_content(self): + assert _safe_fence("here ```` is four") == "`````" + + def test_handles_empty_string(self): + assert _safe_fence("") == "```" + + def test_handles_run_at_end(self): + # Trailing run still counted + assert _safe_fence("text ending in ```") == "````" + + +class TestBlockquotePrefix: + def test_single_line(self): + assert _blockquote_prefix("hello") == "> hello" + + def test_multi_line(self): + assert _blockquote_prefix("a\nb\nc") == "> a\n> b\n> c" + + def test_empty_lines_become_naked_quote_marker(self): + assert _blockquote_prefix("a\n\nb") == "> a\n>\n> b" + + def test_empty_string(self): + assert _blockquote_prefix("") == ">" + + +class TestBlockConstructors: + def test_make_text_block_returns_none_for_empty(self): + assert make_text_block("") is None + assert make_text_block(" ") is None + + def test_make_text_block_returns_dict(self): + b = make_text_block("hello") + assert b == {"type": "text", "text": "hello"} + + def test_make_code_block_returns_none_for_empty(self): + assert make_code_block("") is None + + def test_make_thinking_block_returns_none_for_empty(self): + assert make_thinking_block("") is None + + +class TestRenderBlocks: + def test_text_block_renders_as_paragraph(self): + out = render_blocks_to_markdown([make_text_block("Hello world")]) + assert out == "Hello world" + + def test_blocks_separated_by_blank_line(self): + out = render_blocks_to_markdown( + [make_text_block("first"), make_text_block("second")] + ) + assert out == "first\n\nsecond" + + def test_code_block_with_language(self): + out = render_blocks_to_markdown([make_code_block("print(1)", language="python")]) + assert "```python" in out + assert "print(1)" in out + + def test_thinking_block_uses_blockquote(self): + out = render_blocks_to_markdown([make_thinking_block("step 1\nstep 2")]) + assert "**πŸ’­ Reasoning**" in out + assert "> step 1" in out + assert "> step 2" in out + + def test_tool_use_renders_as_blockquote_with_safe_fence(self): + out = render_blocks_to_markdown( + [make_tool_use_block("search", {"query": "test"})] + ) + assert "> πŸ”§ **Tool: search**" in out + # Every line of the body is blockquote-prefixed + assert "> ```json" in out + assert "> }" 
in out + + def test_tool_use_with_multiline_input(self): + out = render_blocks_to_markdown( + [make_tool_use_block("complex", {"a": 1, "b": [{"x": "y"}]})] + ) + # Prefix every line of multi-line JSON + for line in out.split("\n"): + assert line.startswith(">") or line == "" + + def test_tool_result_success_uses_outbox_icon(self): + out = render_blocks_to_markdown([make_tool_result_block("OK")]) + assert "πŸ“€ **Result**" in out + assert "❌" not in out + + def test_tool_result_error_uses_x_icon(self): + out = render_blocks_to_markdown([make_tool_result_block("oops", is_error=True)]) + assert "❌ **Result (error)**" in out + assert "πŸ“€" not in out + + def test_image_placeholder_rendering(self): + out = render_blocks_to_markdown( + [make_image_placeholder(ref="file-123", source="user_upload")] + ) + assert "πŸ–ΌοΈ **Image attached**" in out + assert "`file-123`" in out + assert "user_upload" in out + assert "content not preserved" in out + + def test_file_placeholder_with_metadata(self): + out = render_blocks_to_markdown( + [make_file_placeholder(ref="sediment://x", mime="audio/wav", size_bytes=10240, duration_seconds=2.5)] + ) + assert "πŸ“Ž **File attached**" in out + assert "audio/wav" in out + assert "KB" in out + assert "2.50s" in out + + def test_unknown_block_renders_with_keys(self): + out = render_blocks_to_markdown( + [ + make_unknown_block( + raw_type="future_x", + observed_keys=["foo", "bar"], + reason=UNKNOWN_REASON_UNKNOWN_TYPE, + ) + ] + ) + assert "⚠️ **Unsupported content**" in out + assert "future_x" in out + assert "`foo`" in out + assert "`bar`" in out + + def test_unknown_extraction_failed_includes_summary(self): + out = render_blocks_to_markdown( + [ + make_unknown_block( + raw_type="audio_transcription", + observed_keys=["asset_pointer"], + reason=UNKNOWN_REASON_EXTRACTION_FAILED, + summary="expected key 'text' not found", + ) + ] + ) + assert "extraction_failed" in out + assert "expected key 'text' not found" in out + + def test_hidden_context_marker(self): + out = render_blocks_to_markdown( + [make_hidden_context_marker("user_editable_context")] + ) + assert "ℹ️ **Hidden context**" in out + assert "`user_editable_context`" in out + + def test_safe_fence_prevents_runaway_code_block(self): + # Content contains an unbalanced opening fence β€” without _safe_fence + # this would corrupt downstream rendering. + evil_content = "before\n```Follow\ntext\nraw is: \"```" + block = make_code_block(evil_content) + out = render_blocks_to_markdown([block, make_text_block("after")]) + # The 4-backtick wrap should be present + assert "````" in out + # The "after" text should appear OUTSIDE any code block β€” it follows + # the closing ```` fence. 
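+        # (Without the longer fence, some renderers treat the embedded runs
+        # as fence delimiters, so everything after them can be mis-scoped.)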
+ assert out.endswith("after") + + def test_block_order_preserved(self): + blocks = [ + make_text_block("a"), + make_image_placeholder(ref="r1", source="user_upload"), + make_text_block("b"), + ] + out = render_blocks_to_markdown(blocks) + assert out.index("a") < out.index("Image attached") + assert out.index("Image attached") < out.index("b") + + +# --------------------------------------------------------------------------- +# Markdown exporter with blocks +# --------------------------------------------------------------------------- + + +SAMPLE_CONV_BLOCKS = { + "id": "blocks12345", + "title": "Blocks Conversation", + "provider": "claude", + "project": None, + "created_at": "2024-06-10T14:32:00Z", + "updated_at": "2024-06-10T15:00:00Z", + "message_count": 1, + "messages": [ + { + "role": "assistant", + "content_type": "text", + "timestamp": None, + "blocks": [ + {"type": "text", "text": "Here is the answer."}, + {"type": "tool_use", "name": "search", "input": {"q": "x"}, "tool_id": "t1"}, + ], + } + ], +} + + +class TestMarkdownExporterWithBlocks: + def test_renders_blocks(self, tmp_path): + exp = MarkdownExporter(tmp_path) + path = exp.export(SAMPLE_CONV_BLOCKS) + body = path.read_text() + assert "Here is the answer." in body + assert "πŸ”§ **Tool: search**" in body + + def test_falls_back_to_content_when_blocks_missing(self, tmp_path): + # Backward-compat: messages with `content` only (no `blocks`) still render. + exp = MarkdownExporter(tmp_path) + path = exp.export(SAMPLE_CONV) # SAMPLE_CONV has content only, no blocks + body = path.read_text() + assert "Hello, how are you?" in body + + def test_skips_messages_with_neither_blocks_nor_content(self, tmp_path): + conv = { + **SAMPLE_CONV_BLOCKS, + "messages": [ + {"role": "user", "content_type": "text", "timestamp": None, "blocks": []}, + {"role": "assistant", "content_type": "text", "timestamp": None, "blocks": [ + {"type": "text", "text": "I am here."} + ]}, + ], + } + exp = MarkdownExporter(tmp_path) + path = exp.export(conv) + body = path.read_text() + assert "I am here." 
in body diff --git a/tests/test_providers.py b/tests/test_providers.py index 8097610..8d7ea41 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,19 +1,52 @@ """Unit tests for src/providers/ using fixture files.""" import json +import logging from pathlib import Path import pytest +from src.blocks import ( + BLOCK_TYPE_FILE_PLACEHOLDER, + BLOCK_TYPE_HIDDEN_CONTEXT_MARKER, + BLOCK_TYPE_IMAGE_PLACEHOLDER, + BLOCK_TYPE_TEXT, + BLOCK_TYPE_THINKING, + BLOCK_TYPE_TOOL_RESULT, + BLOCK_TYPE_TOOL_USE, + BLOCK_TYPE_UNKNOWN, +) +from src.loss_report import LossReport + FIXTURES = Path(__file__).parent / "fixtures" +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _block_types(message: dict) -> list[str]: + return [b.get("type") for b in (message.get("blocks") or [])] + + +def _first_block(message: dict, block_type: str) -> dict | None: + for b in message.get("blocks") or []: + if b.get("type") == block_type: + return b + return None + + +# --------------------------------------------------------------------------- +# ChatGPT +# --------------------------------------------------------------------------- + + class TestChatGPTNormalization: - """Test ChatGPTProvider.normalize_conversation() using fixture data.""" + """ChatGPT normalize_conversation block-extraction behavior.""" def _get_provider(self): from src.providers.chatgpt import ChatGPTProvider - # Bypass __init__ token check p = ChatGPTProvider.__new__(ChatGPTProvider) import requests p._session = requests.Session() @@ -31,7 +64,6 @@ class TestChatGPTNormalization: assert result["id"] == "chatgpt-conv-001" assert result["title"] == "Python Async Tutorial" assert result["provider"] == "chatgpt" - # No entry in _project_map β†’ project is None assert result["project"] is None assert result["created_at"] != "" assert result["updated_at"] != "" @@ -46,7 +78,6 @@ class TestChatGPTNormalization: assert result["id"] == "chatgpt-conv-002" def test_normalizes_with_project_from_map(self): - """Project name from _project_map (populated by fetch_all_conversations) flows through.""" raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) p = self._get_provider() p._project_map["chatgpt-conv-001"] = "My Research Project" @@ -54,32 +85,167 @@ class TestChatGPTNormalization: assert result["project"] == "My Research Project" - def test_extracts_text_messages(self): + def test_text_message_emits_text_block(self): raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) p = self._get_provider() result = p.normalize_conversation(raw) - assert len(result["messages"]) >= 2 user_msgs = [m for m in result["messages"] if m["role"] == "user"] - assert any("async" in m["content"].lower() for m in user_msgs) + # The "How does async/await..." 
message + async_msgs = [ + m for m in user_msgs + if any( + "async" in (b.get("text") or "").lower() + for b in (m.get("blocks") or []) + ) + ] + assert async_msgs, "expected a user message about async/await" + assert _block_types(async_msgs[0]) == [BLOCK_TYPE_TEXT] - def test_skips_non_text_content_with_warning(self, caplog): - import logging + def test_code_block_preserved_with_language(self): raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) p = self._get_provider() - with caplog.at_level(logging.WARNING): - result = p.normalize_conversation(raw) - # The fixture has an image_asset_pointer node β€” should be warned about - assert any( - "image_asset_pointer" in r.message or "rich content" in r.message - for r in caplog.records - ) + result = p.normalize_conversation(raw) - def test_model_editable_context_included_without_warning(self, caplog): - """model_editable_context messages (project instructions) should be included, not warned about.""" - import logging - conv = { - "id": "test-conv-mec", + assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"] + # The first assistant message is the async/await answer with a python fence + text_block = _first_block(assistant_msgs[0], BLOCK_TYPE_TEXT) + assert text_block is not None + assert "```python" in text_block["text"] + + def test_multimodal_voice_user_message(self): + raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) + p = self._get_provider() + result = p.normalize_conversation(raw) + + # node-mm-user: audio_transcription "What is the capital of France?" + # + real_time_user_audio_video_asset_pointer wrapping a sediment:// URL + capital_msgs = [ + m for m in result["messages"] + if any( + "capital of france" in (b.get("text") or "").lower() + for b in (m.get("blocks") or []) + ) + ] + assert capital_msgs, "expected the audio_transcription text to surface" + types = _block_types(capital_msgs[0]) + assert BLOCK_TYPE_TEXT in types + assert BLOCK_TYPE_FILE_PLACEHOLDER in types + + file_block = _first_block(capital_msgs[0], BLOCK_TYPE_FILE_PLACEHOLDER) + assert file_block["ref"].startswith("sediment://") + assert file_block["mime"] == "audio/wav" + assert file_block["size_bytes"] == 50000 + assert file_block["duration_seconds"] == pytest.approx(2.5) + + def test_multimodal_voice_reverse_order_preserved(self): + raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) + p = self._get_provider() + result = p.normalize_conversation(raw) + + # node-mm-user-rev has parts in REVERSE order: asset first, transcription second. 
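+        # (Guards against implementations that bucket parts by type instead
+        # of walking them in document order.)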
+ rev_msgs = [ + m for m in result["messages"] + if any( + "tell me more" in (b.get("text") or "").lower() + for b in (m.get("blocks") or []) + ) + ] + assert rev_msgs, "expected the reverse-order voice message" + types = _block_types(rev_msgs[0]) + # Order preserved: file_placeholder before text + assert types == [BLOCK_TYPE_FILE_PLACEHOLDER, BLOCK_TYPE_TEXT] + + def test_image_only_user_message_renders(self): + raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) + p = self._get_provider() + result = p.normalize_conversation(raw) + + image_msgs = [ + m for m in result["messages"] + if any(b.get("type") == BLOCK_TYPE_IMAGE_PLACEHOLDER for b in (m.get("blocks") or [])) + ] + assert image_msgs, "image-only user message should now render" + + def test_user_editable_context_emits_blocks(self): + raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) + p = self._get_provider() + result = p.normalize_conversation(raw) + + # The user_editable_context message has user_profile + user_instructions. + # It should now appear (was silently dropped pre-v0.4.0). + uec_msgs = [ + m for m in result["messages"] + if any( + "Custom Instructions" in (b.get("text") or "") + for b in (m.get("blocks") or []) + ) + ] + assert uec_msgs, "user_editable_context should be visible in output" + # Hidden context marker should be prepended. + assert uec_msgs[0]["blocks"][0]["type"] == BLOCK_TYPE_HIDDEN_CONTEXT_MARKER + + def test_user_editable_context_uses_safe_fence(self): + """The user_instructions value contains embedded triple-backticks; the rendered + Markdown must use a fence longer than 3 backticks so embedded fences are inert. + """ + from src.blocks import render_blocks_to_markdown + + raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) + p = self._get_provider() + result = p.normalize_conversation(raw) + + uec_msgs = [ + m for m in result["messages"] + if any( + "Custom Instructions" in (b.get("text") or "") + for b in (m.get("blocks") or []) + ) + ] + assert uec_msgs + rendered = render_blocks_to_markdown(uec_msgs[0]["blocks"]) + # Content has ``` inside, so the wrap fence must be at least 4 backticks. 
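+        # (The fixture's user_instructions embeds a ```-wrapped snippet, so a
+        # plain triple-backtick wrap would terminate early.)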
+        assert "````" in rendered, "expected a 4+ backtick safe-fence wrap"
+
+    def test_message_roles_are_valid(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+        for msg in result["messages"]:
+            assert msg["role"] in ("user", "assistant", "system", "tool")
+
+    def test_message_count_matches(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+        assert result["message_count"] == len(result["messages"])
+
+    def test_loss_report_records_messages(self):
+        raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text())
+        p = self._get_provider()
+        report = LossReport()
+        result = p.normalize_conversation(raw, report)
+        assert report.messages_rendered == len(result["messages"])
+        assert report.conversations == 1
+
+
+class TestChatGPTUnknownContent:
+    """Unrecognized content types should yield a visible unknown block, a WARNING log, and a loss-report tally."""
+
+    def _get_provider(self):
+        from src.providers.chatgpt import ChatGPTProvider
+        p = ChatGPTProvider.__new__(ChatGPTProvider)
+        import requests
+        p._session = requests.Session()
+        p._org_id = None
+        p._project_ids = []
+        p._project_map = {}
+        p._project_name_cache = {}
+        return p
+
+    def _make_unknown_conv(self):
+        return {
+            "id": "test-unknown",
             "title": "Test",
             "create_time": 1700000000.0,
             "update_time": 1700000001.0,
@@ -91,46 +257,45 @@ class TestChatGPTNormalization:
                         "id": "msg1",
                         "author": {"role": "user"},
                         "content": {
-                            "content_type": "model_editable_context",
-                            "parts": ["These are the project instructions."],
+                            "content_type": "future_unknown_type_xyz",
+                            "some_field": "value",
                         },
-                        "create_time": 1700000001.0,
-                        "status": "finished_successfully",
                     },
                     "parent": "root",
                     "children": [],
                 },
             },
         }
+
+    def test_unknown_content_type_produces_unknown_block(self):
+        p = self._get_provider()
+        result = p.normalize_conversation(self._make_unknown_conv())
+        assert any(
+            b.get("type") == BLOCK_TYPE_UNKNOWN
+            for m in result["messages"]
+            for b in (m.get("blocks") or [])
+        )
+
+    def test_unknown_content_type_logs_warning(self, caplog):
         p = self._get_provider()
         with caplog.at_level(logging.WARNING):
-            result = p.normalize_conversation(conv)
-        assert any(m["content"] == "These are the project instructions." 
for m in result["messages"]) - assert not any("model_editable_context" in r.message for r in caplog.records) + p.normalize_conversation(self._make_unknown_conv()) + assert any("future_unknown_type_xyz" in r.message for r in caplog.records) - def test_message_roles_are_valid(self): - raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) + def test_unknown_content_type_increments_loss_report(self): p = self._get_provider() - result = p.normalize_conversation(raw) - for msg in result["messages"]: - assert msg["role"] in ("user", "assistant", "system") + report = LossReport() + p.normalize_conversation(self._make_unknown_conv(), report) + assert report.unknown_blocks["future_unknown_type_xyz"] == 1 - def test_message_count_matches(self): - raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) - p = self._get_provider() - result = p.normalize_conversation(raw) - assert result["message_count"] == len(result["messages"]) - def test_code_fence_preserved(self): - raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) - p = self._get_provider() - result = p.normalize_conversation(raw) - all_content = " ".join(m["content"] for m in result["messages"]) - assert "```python" in all_content +# --------------------------------------------------------------------------- +# Claude +# --------------------------------------------------------------------------- class TestClaudeNormalization: - """Test ClaudeProvider.normalize_conversation() using fixture data.""" + """Claude normalize_conversation block-extraction behavior.""" def _get_provider(self): from src.providers.claude import ClaudeProvider @@ -150,55 +315,138 @@ class TestClaudeNormalization: assert result["provider"] == "claude" assert result["project"] == "StarTOS Packaging" assert result["created_at"] == "2024-06-10T14:32:00.000Z" - assert isinstance(result["messages"], list) def test_normalizes_without_project(self): raw = json.loads((FIXTURES / "claude_no_project.json").read_text()) p = self._get_provider() result = p.normalize_conversation(raw) - assert result["project"] is None - assert result["id"] == "claude-conv-002" - def test_string_content_extracted(self): - raw = json.loads((FIXTURES / "claude_no_project.json").read_text()) + def test_string_content_emits_text_block(self): + raw = json.loads((FIXTURES / "claude_conversation.json").read_text()) p = self._get_provider() result = p.normalize_conversation(raw) - assert any("Docker" in m["content"] for m in result["messages"]) + thanks_msgs = [ + m for m in result["messages"] + if any( + "thank you" in (b.get("text") or "").lower() + for b in (m.get("blocks") or []) + ) + ] + assert thanks_msgs - def test_list_content_extracted(self): + def test_list_content_emits_blocks_in_order(self): raw = json.loads((FIXTURES / "claude_conversation.json").read_text()) p = self._get_provider() result = p.normalize_conversation(raw) assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"] - assert any("manifest" in m["content"].lower() for m in assistant_msgs) + # msg-002 has text + tool_use, in that order. 
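+        # (Same invariant as the ChatGPT reverse-order voice test: blocks
+        # come out in raw content order, never re-sorted by type.)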
+        assert assistant_msgs
+        types = _block_types(assistant_msgs[0])
+        assert BLOCK_TYPE_TEXT in types
+        assert BLOCK_TYPE_TOOL_USE in types
+        # Order preserved
+        assert types.index(BLOCK_TYPE_TEXT) < types.index(BLOCK_TYPE_TOOL_USE)

-    def test_non_text_blocks_skipped_with_warning(self, caplog):
-        import logging
+    def test_tool_use_block_fields(self):
         raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
         p = self._get_provider()
-        with caplog.at_level(logging.WARNING):
-            result = p.normalize_conversation(raw)
-        # The fixture has a tool_use block — should warn
+        result = p.normalize_conversation(raw)
+
+        assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"]
+        tool_block = _first_block(assistant_msgs[0], BLOCK_TYPE_TOOL_USE)
+        assert tool_block["name"] == "search"
+        assert tool_block["input"] == {"query": "startOS docs"}
+        assert tool_block["tool_id"] == "tool-001"
+
+    def test_image_block_emits_image_placeholder(self):
+        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
+        p = self._get_provider()
+        result = p.normalize_conversation(raw)
+
+        msg004 = [
+            m for m in result["messages"]
+            if any(b.get("type") == BLOCK_TYPE_IMAGE_PLACEHOLDER for b in (m.get("blocks") or []))
+        ]
+        assert msg004
+        img = _first_block(msg004[0], BLOCK_TYPE_IMAGE_PLACEHOLDER)
+        assert img["ref"] == "claude-image-uuid-1"
+
+    def test_unknown_block_type_records_loss(self):
+        raw = {
+            "uuid": "test-unknown",
+            "name": "T",
+            "chat_messages": [
+                {
+                    "uuid": "m1",
+                    "sender": "human",
+                    "content": [{"type": "future_block_xyz", "data": "..."}],
+                }
+            ],
+        }
+        p = self._get_provider()
+        report = LossReport()
+        result = p.normalize_conversation(raw, report)
         assert any(
-            "tool_use" in r.message or "rich content" in r.message
-            for r in caplog.records
+            b.get("type") == BLOCK_TYPE_UNKNOWN
+            for m in result["messages"]
+            for b in (m.get("blocks") or [])
         )
+        assert report.unknown_blocks["future_block_xyz"] == 1

-    def test_message_count_matches(self):
-        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
+    def test_thinking_block(self):
+        raw = {
+            "uuid": "thinking-test",
+            "name": "T",
+            "chat_messages": [
+                {
+                    "uuid": "m1",
+                    "sender": "assistant",
+                    "content": [
+                        {"type": "thinking", "thinking": "Let me reason about this."},
+                        {"type": "text", "text": "Here's the answer."},
+                    ],
+                }
+            ],
+        }
         p = self._get_provider()
         result = p.normalize_conversation(raw)
-        assert result["message_count"] == len(result["messages"])
+        types = _block_types(result["messages"][0])
+        assert BLOCK_TYPE_THINKING in types
+        assert BLOCK_TYPE_TEXT in types

-    def test_roles_normalized(self):
-        raw = json.loads((FIXTURES / "claude_conversation.json").read_text())
+    def test_tool_result_with_nested_text_blocks(self):
+        raw = {
+            "uuid": "tool-result-test",
+            "name": "T",
+            "chat_messages": [
+                {
+                    "uuid": "m1",
+                    "sender": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_result",
+                            "tool_use_id": "tool-001",
+                            "content": [
+                                {"type": "text", "text": "search hit 1"},
+                                {"type": "text", "text": "search hit 2"},
+                            ],
+                            "is_error": False,
+                        }
+                    ],
+                }
+            ],
+        }
         p = self._get_provider()
         result = p.normalize_conversation(raw)
-        for msg in result["messages"]:
-            assert msg["role"] in ("user", "assistant", "system")
+        tool_result = _first_block(result["messages"][0], BLOCK_TYPE_TOOL_RESULT)
+        assert tool_result is not None
+        assert "search hit 1" in tool_result["output"]
+        assert "search hit 2" in tool_result["output"]
+        assert 
tool_result["is_error"] is False def test_human_sender_maps_to_user(self): raw = json.loads((FIXTURES / "claude_conversation.json").read_text()) @@ -207,3 +455,10 @@ class TestClaudeNormalization: roles = {m["role"] for m in result["messages"]} assert "user" in roles assert "human" not in roles + + def test_loss_report_messages_recorded(self): + raw = json.loads((FIXTURES / "claude_conversation.json").read_text()) + p = self._get_provider() + report = LossReport() + result = p.normalize_conversation(raw, report) + assert report.messages_rendered == len(result["messages"])
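
Note for reviewers: the TestSafeFence cases above pin the fence-selection
contract (minimum of three backticks, otherwise one more backtick than the
longest run in the content). A minimal sketch consistent with those cases,
for reference only; the shipped helper is `_safe_fence` in src/blocks.py and
its internals may differ:

```python
import re


def _safe_fence(content: str) -> str:
    """Pick a backtick fence longer than any backtick run in `content`."""
    # Longest run of consecutive backticks anywhere in the content; 0 if none.
    longest = max((len(m.group(0)) for m in re.finditer(r"`+", content)), default=0)
    # Never fewer than three backticks (the Markdown minimum).
    return "`" * max(3, longest + 1)
```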