Fix for project files not being extracted

2026-03-30 13:22:05 -04:00
parent 050cd49124
commit 340293ab94
1 changed files with 59 additions and 24 deletions
--- a/src/providers/chatgpt.py
+++ b/src/providers/chatgpt.py
@@ -622,12 +622,52 @@ def _extract_messages(
            if role in ("user", "assistant"):
                content_obj = msg_data.get("content", {})
                content_type = content_obj.get("content_type", "text")
-                text = _extract_text(content_obj, conv_id, node_id)
+                ts = msg_data.get("create_time")
-                # model_editable_context carries project instructions as plain text parts
+                # Content types whose parts[] contain plain text strings.
-                _TEXT_EXTRACTABLE = {"text", "model_editable_context"}
+                # model_editable_context / user_editable_context = project instructions
                # thoughts / reasoning_recap = o1/o3 reasoning traces
                _TEXT_PARTS_TYPES = {
                    "text",
                    "model_editable_context",
                    "user_editable_context",
                    "thoughts",
                    "reasoning_recap",
                }
-                if content_type not in _TEXT_EXTRACTABLE:
+                if content_type in _TEXT_PARTS_TYPES:
                    text = _extract_text(content_obj, conv_id, node_id)
                    if text:
                        messages.append(
                            {
                                "role": role,
                                "content": text,
                                "content_type": "text",
                                "timestamp": _ts_to_iso(ts) if ts else None,
                            }
                        )
                    else:
                        logger.debug(
                            "[chatgpt] Skipping empty %s message in conversation %s",
                            content_type,
                            conv_id[:8],
                        )
                elif content_type == "code":
                    # Inline code response — extract and wrap in a fenced code block
                    code_text = content_obj.get("text") or "\n".join(
                        p for p in content_obj.get("parts", []) if isinstance(p, str)
                    )
                    language = content_obj.get("language", "")
                    if code_text:
                        messages.append(
                            {
                                "role": role,
                                "content": f"```{language}\n{code_text}\n```",
                                "content_type": "code",
                                "timestamp": _ts_to_iso(ts) if ts else None,
                            }
                        )
                else:
                    logger.warning(
                        "[chatgpt] Skipping %s content in conversation %s message %s "
                        "— rich content not yet supported (see FUTURE.md)",
@@ -635,20 +675,6 @@ def _extract_messages(
                        conv_id[:8],
                        node_id[:8],
                    )
                elif text:
                    ts = msg_data.get("create_time")
                    messages.append(
                        {
                            "role": role,
                            "content": text,
                            "content_type": "text",
                            "timestamp": _ts_to_iso(ts) if ts else None,
                        }
                    )
                else:
                    logger.debug(
                        "[chatgpt] Skipping empty message in conversation %s", conv_id[:8]
                    )
        # Walk children in order (ChatGPT typically has one child per node in a linear chat)
        for child_id in node.get("children", []):
@@ -673,7 +699,13 @@ def _find_root(mapping: dict[str, Any]) -> str | None:
 def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
-    """Extract plain text from a ChatGPT content object."""
+    """Extract plain text from a ChatGPT content object.
    Handles three part shapes:
    - str  — plain text (most messages)
    - dict with content_type="text" — wrapped text part
    - dict with "content" key — o1/o3 thoughts/reasoning parts
    """
    parts = content_obj.get("parts", [])
    if not parts:
        return ""
@@ -683,16 +715,19 @@ def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
        if isinstance(part, str):
            text_parts.append(part)
        elif isinstance(part, dict):
-            # Could be an image or file reference — skip and warn
+            part_type = part.get("content_type", "")
-            part_type = part.get("content_type", "unknown")
+            if part_type == "text":
-            if part_type != "text":
+                text_parts.append(part.get("text", ""))
            elif "content" in part:
                # o1/o3 thoughts parts: {"summary": "...", "content": "..."}
                text_parts.append(part["content"])
            elif part_type:
                # Image, file, or other binary attachment — skip and warn
                logger.warning(
                    "[chatgpt] Skipping %s attachment in conversation %s "
                    "— rich content not yet supported (see FUTURE.md)",
                    part_type,
                    conv_id[:8],
                )
            else:
                text_parts.append(part.get("text", ""))
    return "\n".join(t for t in text_parts if t)