From 340293ab940ecf570daa8256b3bb707c74918ee4 Mon Sep 17 00:00:00 2001 From: JesseMarkowitz Date: Mon, 30 Mar 2026 13:22:05 -0400 Subject: [PATCH] fix for project files not extracted --- src/providers/chatgpt.py | 83 ++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/src/providers/chatgpt.py b/src/providers/chatgpt.py index 2990559..1bf6fdb 100644 --- a/src/providers/chatgpt.py +++ b/src/providers/chatgpt.py @@ -622,12 +622,52 @@ def _extract_messages( if role in ("user", "assistant"): content_obj = msg_data.get("content", {}) content_type = content_obj.get("content_type", "text") - text = _extract_text(content_obj, conv_id, node_id) + ts = msg_data.get("create_time") - # model_editable_context carries project instructions as plain text parts - _TEXT_EXTRACTABLE = {"text", "model_editable_context"} + # Content types whose parts[] contain plain text strings. + # model_editable_context / user_editable_context = project instructions + # thoughts / reasoning_recap = o1/o3 reasoning traces + _TEXT_PARTS_TYPES = { + "text", + "model_editable_context", + "user_editable_context", + "thoughts", + "reasoning_recap", + } - if content_type not in _TEXT_EXTRACTABLE: + if content_type in _TEXT_PARTS_TYPES: + text = _extract_text(content_obj, conv_id, node_id) + if text: + messages.append( + { + "role": role, + "content": text, + "content_type": "text", + "timestamp": _ts_to_iso(ts) if ts else None, + } + ) + else: + logger.debug( + "[chatgpt] Skipping empty %s message in conversation %s", + content_type, + conv_id[:8], + ) + elif content_type == "code": + # Inline code response — extract and wrap in a fenced code block + code_text = content_obj.get("text") or "\n".join( + p for p in content_obj.get("parts", []) if isinstance(p, str) + ) + language = content_obj.get("language", "") + if code_text: + messages.append( + { + "role": role, + "content": f"```{language}\n{code_text}\n```", + "content_type": "code", + "timestamp": _ts_to_iso(ts) if ts else None, + } + ) + else: logger.warning( "[chatgpt] Skipping %s content in conversation %s message %s " "— rich content not yet supported (see FUTURE.md)", @@ -635,20 +675,6 @@ def _extract_messages( conv_id[:8], node_id[:8], ) - elif text: - ts = msg_data.get("create_time") - messages.append( - { - "role": role, - "content": text, - "content_type": "text", - "timestamp": _ts_to_iso(ts) if ts else None, - } - ) - else: - logger.debug( - "[chatgpt] Skipping empty message in conversation %s", conv_id[:8] - ) # Walk children in order (ChatGPT typically has one child per node in a linear chat) for child_id in node.get("children", []): @@ -673,7 +699,13 @@ def _find_root(mapping: dict[str, Any]) -> str | None: def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str: - """Extract plain text from a ChatGPT content object.""" + """Extract plain text from a ChatGPT content object. + + Handles three part shapes: + - str — plain text (most messages) + - dict with content_type="text" — wrapped text part + - dict with "content" key — o1/o3 thoughts/reasoning parts + """ parts = content_obj.get("parts", []) if not parts: return "" @@ -683,16 +715,19 @@ def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str: if isinstance(part, str): text_parts.append(part) elif isinstance(part, dict): - # Could be an image or file reference — skip and warn - part_type = part.get("content_type", "unknown") - if part_type != "text": + part_type = part.get("content_type", "") + if part_type == "text": + text_parts.append(part.get("text", "")) + elif "content" in part: + # o1/o3 thoughts parts: {"summary": "...", "content": "..."} + text_parts.append(part["content"]) + elif part_type: + # Image, file, or other binary attachment — skip and warn logger.warning( "[chatgpt] Skipping %s attachment in conversation %s " "— rich content not yet supported (see FUTURE.md)", part_type, conv_id[:8], ) - else: - text_parts.append(part.get("text", "")) return "\n".join(t for t in text_parts if t)