From 340293ab940ecf570daa8256b3bb707c74918ee4 Mon Sep 17 00:00:00 2001
From: JesseMarkowitz <Jesse.Markowitz@gmail.com>
Date: Mon, 30 Mar 2026 13:22:05 -0400
Subject: [PATCH] Fix project files not being extracted

---
 src/providers/chatgpt.py | 83 ++++++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 24 deletions(-)

diff --git a/src/providers/chatgpt.py b/src/providers/chatgpt.py
index 2990559..1bf6fdb 100644
--- a/src/providers/chatgpt.py
+++ b/src/providers/chatgpt.py
@@ -622,12 +622,52 @@ def _extract_messages(
             if role in ("user", "assistant"):
                 content_obj = msg_data.get("content", {})
                 content_type = content_obj.get("content_type", "text")
-                text = _extract_text(content_obj, conv_id, node_id)
+                ts = msg_data.get("create_time")
 
-                # model_editable_context carries project instructions as plain text parts
-                _TEXT_EXTRACTABLE = {"text", "model_editable_context"}
+                # Content types whose parts[] hold strings or text-bearing dicts.
+                # model_editable_context / user_editable_context = project instructions
+                # thoughts / reasoning_recap = o1/o3 reasoning traces
+                _TEXT_PARTS_TYPES = {
+                    "text",
+                    "model_editable_context",
+                    "user_editable_context",
+                    "thoughts",
+                    "reasoning_recap",
+                }
 
-                if content_type not in _TEXT_EXTRACTABLE:
+                if content_type in _TEXT_PARTS_TYPES:
+                    text = _extract_text(content_obj, conv_id, node_id)
+                    if text:
+                        messages.append(
+                            {
+                                "role": role,
+                                "content": text,
+                                "content_type": "text",
+                                "timestamp": _ts_to_iso(ts) if ts else None,
+                            }
+                        )
+                    else:
+                        logger.debug(
+                            "[chatgpt] Skipping empty %s message in conversation %s",
+                            content_type,
+                            conv_id[:8],
+                        )
+                elif content_type == "code":
+                    # Inline code response — extract and wrap in a fenced code block
+                    code_text = content_obj.get("text") or "\n".join(
+                        p for p in content_obj.get("parts", []) if isinstance(p, str)
+                    )
+                    language = content_obj.get("language", "")
+                    if code_text:
+                        messages.append(
+                            {
+                                "role": role,
+                                "content": f"```{language}\n{code_text}\n```",
+                                "content_type": "code",
+                                "timestamp": _ts_to_iso(ts) if ts else None,
+                            }
+                        )
+                else:
                     logger.warning(
                         "[chatgpt] Skipping %s content in conversation %s message %s "
                         "— rich content not yet supported (see FUTURE.md)",
@@ -635,20 +675,6 @@ def _extract_messages(
                         conv_id[:8],
                         node_id[:8],
                     )
-                elif text:
-                    ts = msg_data.get("create_time")
-                    messages.append(
-                        {
-                            "role": role,
-                            "content": text,
-                            "content_type": "text",
-                            "timestamp": _ts_to_iso(ts) if ts else None,
-                        }
-                    )
-                else:
-                    logger.debug(
-                        "[chatgpt] Skipping empty message in conversation %s", conv_id[:8]
-                    )
 
         # Walk children in order (ChatGPT typically has one child per node in a linear chat)
         for child_id in node.get("children", []):
@@ -673,7 +699,13 @@ def _find_root(mapping: dict[str, Any]) -> str | None:
 
 
 def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
-    """Extract plain text from a ChatGPT content object."""
+    """Extract plain text from a ChatGPT content object.
+
+    Handles three part shapes:
+    - str  — plain text (most messages)
+    - dict with content_type="text" — wrapped text part
+    - dict with "content" key — o1/o3 thoughts/reasoning parts
+    """
     parts = content_obj.get("parts", [])
     if not parts:
         return ""
@@ -683,16 +715,19 @@ def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
         if isinstance(part, str):
             text_parts.append(part)
         elif isinstance(part, dict):
-            # Could be an image or file reference — skip and warn
-            part_type = part.get("content_type", "unknown")
-            if part_type != "text":
+            part_type = part.get("content_type", "")
+            if part_type == "text":
+                text_parts.append(part.get("text", ""))
+            elif "content" in part:
+                # o1/o3 thoughts parts: {"summary": "...", "content": "..."}
+                text_parts.append(part["content"])
+            elif part_type:
+                # Image, file, or other binary attachment — skip and warn
                 logger.warning(
                     "[chatgpt] Skipping %s attachment in conversation %s "
                     "— rich content not yet supported (see FUTURE.md)",
                     part_type,
                     conv_id[:8],
                 )
-            else:
-                text_parts.append(part.get("text", ""))
 
     return "\n".join(t for t in text_parts if t)