Fix for project files not being extracted

This commit is contained in:
JesseMarkowitz
2026-03-30 13:22:05 -04:00
parent 050cd49124
commit 340293ab94

View File

@@ -622,21 +622,22 @@ def _extract_messages(
if role in ("user", "assistant"): if role in ("user", "assistant"):
content_obj = msg_data.get("content", {}) content_obj = msg_data.get("content", {})
content_type = content_obj.get("content_type", "text") content_type = content_obj.get("content_type", "text")
text = _extract_text(content_obj, conv_id, node_id)
# model_editable_context carries project instructions as plain text parts
_TEXT_EXTRACTABLE = {"text", "model_editable_context"}
if content_type not in _TEXT_EXTRACTABLE:
logger.warning(
"[chatgpt] Skipping %s content in conversation %s message %s "
"— rich content not yet supported (see FUTURE.md)",
content_type,
conv_id[:8],
node_id[:8],
)
elif text:
ts = msg_data.get("create_time") ts = msg_data.get("create_time")
# Content types whose parts[] contain plain text strings.
# model_editable_context / user_editable_context = project instructions
# thoughts / reasoning_recap = o1/o3 reasoning traces
_TEXT_PARTS_TYPES = {
"text",
"model_editable_context",
"user_editable_context",
"thoughts",
"reasoning_recap",
}
if content_type in _TEXT_PARTS_TYPES:
text = _extract_text(content_obj, conv_id, node_id)
if text:
messages.append( messages.append(
{ {
"role": role, "role": role,
@@ -647,7 +648,32 @@ def _extract_messages(
) )
else: else:
logger.debug( logger.debug(
"[chatgpt] Skipping empty message in conversation %s", conv_id[:8] "[chatgpt] Skipping empty %s message in conversation %s",
content_type,
conv_id[:8],
)
elif content_type == "code":
# Inline code response — extract and wrap in a fenced code block
code_text = content_obj.get("text") or "\n".join(
p for p in content_obj.get("parts", []) if isinstance(p, str)
)
language = content_obj.get("language", "")
if code_text:
messages.append(
{
"role": role,
"content": f"```{language}\n{code_text}\n```",
"content_type": "code",
"timestamp": _ts_to_iso(ts) if ts else None,
}
)
else:
logger.warning(
"[chatgpt] Skipping %s content in conversation %s message %s "
"— rich content not yet supported (see FUTURE.md)",
content_type,
conv_id[:8],
node_id[:8],
) )
# Walk children in order (ChatGPT typically has one child per node in a linear chat) # Walk children in order (ChatGPT typically has one child per node in a linear chat)
@@ -673,7 +699,13 @@ def _find_root(mapping: dict[str, Any]) -> str | None:
def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str: def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
"""Extract plain text from a ChatGPT content object.""" """Extract plain text from a ChatGPT content object.
Handles three part shapes:
- str — plain text (most messages)
- dict with content_type="text" — wrapped text part
- dict with "content" key — o1/o3 thoughts/reasoning parts
"""
parts = content_obj.get("parts", []) parts = content_obj.get("parts", [])
if not parts: if not parts:
return "" return ""
@@ -683,16 +715,19 @@ def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
if isinstance(part, str): if isinstance(part, str):
text_parts.append(part) text_parts.append(part)
elif isinstance(part, dict): elif isinstance(part, dict):
# Could be an image or file reference — skip and warn part_type = part.get("content_type", "")
part_type = part.get("content_type", "unknown") if part_type == "text":
if part_type != "text": text_parts.append(part.get("text", ""))
elif "content" in part:
# o1/o3 thoughts parts: {"summary": "...", "content": "..."}
text_parts.append(part["content"])
elif part_type:
# Image, file, or other binary attachment — skip and warn
logger.warning( logger.warning(
"[chatgpt] Skipping %s attachment in conversation %s " "[chatgpt] Skipping %s attachment in conversation %s "
"— rich content not yet supported (see FUTURE.md)", "— rich content not yet supported (see FUTURE.md)",
part_type, part_type,
conv_id[:8], conv_id[:8],
) )
else:
text_parts.append(part.get("text", ""))
return "\n".join(t for t in text_parts if t) return "\n".join(t for t in text_parts if t)