fix for project files not extracted
This commit is contained in:
@@ -622,21 +622,22 @@ def _extract_messages(
|
||||
if role in ("user", "assistant"):
|
||||
content_obj = msg_data.get("content", {})
|
||||
content_type = content_obj.get("content_type", "text")
|
||||
text = _extract_text(content_obj, conv_id, node_id)
|
||||
|
||||
# model_editable_context carries project instructions as plain text parts
|
||||
_TEXT_EXTRACTABLE = {"text", "model_editable_context"}
|
||||
|
||||
if content_type not in _TEXT_EXTRACTABLE:
|
||||
logger.warning(
|
||||
"[chatgpt] Skipping %s content in conversation %s message %s "
|
||||
"— rich content not yet supported (see FUTURE.md)",
|
||||
content_type,
|
||||
conv_id[:8],
|
||||
node_id[:8],
|
||||
)
|
||||
elif text:
|
||||
ts = msg_data.get("create_time")
|
||||
|
||||
# Content types whose parts[] contain plain text strings.
|
||||
# model_editable_context / user_editable_context = project instructions
|
||||
# thoughts / reasoning_recap = o1/o3 reasoning traces
|
||||
_TEXT_PARTS_TYPES = {
|
||||
"text",
|
||||
"model_editable_context",
|
||||
"user_editable_context",
|
||||
"thoughts",
|
||||
"reasoning_recap",
|
||||
}
|
||||
|
||||
if content_type in _TEXT_PARTS_TYPES:
|
||||
text = _extract_text(content_obj, conv_id, node_id)
|
||||
if text:
|
||||
messages.append(
|
||||
{
|
||||
"role": role,
|
||||
@@ -647,7 +648,32 @@ def _extract_messages(
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
"[chatgpt] Skipping empty message in conversation %s", conv_id[:8]
|
||||
"[chatgpt] Skipping empty %s message in conversation %s",
|
||||
content_type,
|
||||
conv_id[:8],
|
||||
)
|
||||
elif content_type == "code":
|
||||
# Inline code response — extract and wrap in a fenced code block
|
||||
code_text = content_obj.get("text") or "\n".join(
|
||||
p for p in content_obj.get("parts", []) if isinstance(p, str)
|
||||
)
|
||||
language = content_obj.get("language", "")
|
||||
if code_text:
|
||||
messages.append(
|
||||
{
|
||||
"role": role,
|
||||
"content": f"```{language}\n{code_text}\n```",
|
||||
"content_type": "code",
|
||||
"timestamp": _ts_to_iso(ts) if ts else None,
|
||||
}
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"[chatgpt] Skipping %s content in conversation %s message %s "
|
||||
"— rich content not yet supported (see FUTURE.md)",
|
||||
content_type,
|
||||
conv_id[:8],
|
||||
node_id[:8],
|
||||
)
|
||||
|
||||
# Walk children in order (ChatGPT typically has one child per node in a linear chat)
|
||||
@@ -673,7 +699,13 @@ def _find_root(mapping: dict[str, Any]) -> str | None:
|
||||
|
||||
|
||||
def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
|
||||
"""Extract plain text from a ChatGPT content object."""
|
||||
"""Extract plain text from a ChatGPT content object.
|
||||
|
||||
Handles three part shapes:
|
||||
- str — plain text (most messages)
|
||||
- dict with content_type="text" — wrapped text part
|
||||
- dict with "content" key — o1/o3 thoughts/reasoning parts
|
||||
"""
|
||||
parts = content_obj.get("parts", [])
|
||||
if not parts:
|
||||
return ""
|
||||
@@ -683,16 +715,19 @@ def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
|
||||
if isinstance(part, str):
|
||||
text_parts.append(part)
|
||||
elif isinstance(part, dict):
|
||||
# Could be an image or file reference — skip and warn
|
||||
part_type = part.get("content_type", "unknown")
|
||||
if part_type != "text":
|
||||
part_type = part.get("content_type", "")
|
||||
if part_type == "text":
|
||||
text_parts.append(part.get("text", ""))
|
||||
elif "content" in part:
|
||||
# o1/o3 thoughts parts: {"summary": "...", "content": "..."}
|
||||
text_parts.append(part["content"])
|
||||
elif part_type:
|
||||
# Image, file, or other binary attachment — skip and warn
|
||||
logger.warning(
|
||||
"[chatgpt] Skipping %s attachment in conversation %s "
|
||||
"— rich content not yet supported (see FUTURE.md)",
|
||||
part_type,
|
||||
conv_id[:8],
|
||||
)
|
||||
else:
|
||||
text_parts.append(part.get("text", ""))
|
||||
|
||||
return "\n".join(t for t in text_parts if t)
|
||||
|
||||
Reference in New Issue
Block a user