Fix: project files were not being extracted
This commit is contained in:
@@ -622,12 +622,52 @@ def _extract_messages(
|
|||||||
if role in ("user", "assistant"):
|
if role in ("user", "assistant"):
|
||||||
content_obj = msg_data.get("content", {})
|
content_obj = msg_data.get("content", {})
|
||||||
content_type = content_obj.get("content_type", "text")
|
content_type = content_obj.get("content_type", "text")
|
||||||
text = _extract_text(content_obj, conv_id, node_id)
|
ts = msg_data.get("create_time")
|
||||||
|
|
||||||
# model_editable_context carries project instructions as plain text parts
|
# Content types whose parts[] contain plain text strings.
|
||||||
_TEXT_EXTRACTABLE = {"text", "model_editable_context"}
|
# model_editable_context / user_editable_context = project instructions
|
||||||
|
# thoughts / reasoning_recap = o1/o3 reasoning traces
|
||||||
|
_TEXT_PARTS_TYPES = {
|
||||||
|
"text",
|
||||||
|
"model_editable_context",
|
||||||
|
"user_editable_context",
|
||||||
|
"thoughts",
|
||||||
|
"reasoning_recap",
|
||||||
|
}
|
||||||
|
|
||||||
if content_type not in _TEXT_EXTRACTABLE:
|
if content_type in _TEXT_PARTS_TYPES:
|
||||||
|
text = _extract_text(content_obj, conv_id, node_id)
|
||||||
|
if text:
|
||||||
|
messages.append(
|
||||||
|
{
|
||||||
|
"role": role,
|
||||||
|
"content": text,
|
||||||
|
"content_type": "text",
|
||||||
|
"timestamp": _ts_to_iso(ts) if ts else None,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
"[chatgpt] Skipping empty %s message in conversation %s",
|
||||||
|
content_type,
|
||||||
|
conv_id[:8],
|
||||||
|
)
|
||||||
|
elif content_type == "code":
|
||||||
|
# Inline code response — extract and wrap in a fenced code block
|
||||||
|
code_text = content_obj.get("text") or "\n".join(
|
||||||
|
p for p in content_obj.get("parts", []) if isinstance(p, str)
|
||||||
|
)
|
||||||
|
language = content_obj.get("language", "")
|
||||||
|
if code_text:
|
||||||
|
messages.append(
|
||||||
|
{
|
||||||
|
"role": role,
|
||||||
|
"content": f"```{language}\n{code_text}\n```",
|
||||||
|
"content_type": "code",
|
||||||
|
"timestamp": _ts_to_iso(ts) if ts else None,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"[chatgpt] Skipping %s content in conversation %s message %s "
|
"[chatgpt] Skipping %s content in conversation %s message %s "
|
||||||
"— rich content not yet supported (see FUTURE.md)",
|
"— rich content not yet supported (see FUTURE.md)",
|
||||||
@@ -635,20 +675,6 @@ def _extract_messages(
|
|||||||
conv_id[:8],
|
conv_id[:8],
|
||||||
node_id[:8],
|
node_id[:8],
|
||||||
)
|
)
|
||||||
elif text:
|
|
||||||
ts = msg_data.get("create_time")
|
|
||||||
messages.append(
|
|
||||||
{
|
|
||||||
"role": role,
|
|
||||||
"content": text,
|
|
||||||
"content_type": "text",
|
|
||||||
"timestamp": _ts_to_iso(ts) if ts else None,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.debug(
|
|
||||||
"[chatgpt] Skipping empty message in conversation %s", conv_id[:8]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Walk children in order (ChatGPT typically has one child per node in a linear chat)
|
# Walk children in order (ChatGPT typically has one child per node in a linear chat)
|
||||||
for child_id in node.get("children", []):
|
for child_id in node.get("children", []):
|
||||||
@@ -673,7 +699,13 @@ def _find_root(mapping: dict[str, Any]) -> str | None:
|
|||||||
|
|
||||||
|
|
||||||
def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
|
def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
|
||||||
"""Extract plain text from a ChatGPT content object."""
|
"""Extract plain text from a ChatGPT content object.
|
||||||
|
|
||||||
|
Handles three part shapes:
|
||||||
|
- str — plain text (most messages)
|
||||||
|
- dict with content_type="text" — wrapped text part
|
||||||
|
- dict with "content" key — o1/o3 thoughts/reasoning parts
|
||||||
|
"""
|
||||||
parts = content_obj.get("parts", [])
|
parts = content_obj.get("parts", [])
|
||||||
if not parts:
|
if not parts:
|
||||||
return ""
|
return ""
|
||||||
@@ -683,16 +715,19 @@ def _extract_text(content_obj: dict, conv_id: str, node_id: str) -> str:
|
|||||||
if isinstance(part, str):
|
if isinstance(part, str):
|
||||||
text_parts.append(part)
|
text_parts.append(part)
|
||||||
elif isinstance(part, dict):
|
elif isinstance(part, dict):
|
||||||
# Could be an image or file reference — skip and warn
|
part_type = part.get("content_type", "")
|
||||||
part_type = part.get("content_type", "unknown")
|
if part_type == "text":
|
||||||
if part_type != "text":
|
text_parts.append(part.get("text", ""))
|
||||||
|
elif "content" in part:
|
||||||
|
# o1/o3 thoughts parts: {"summary": "...", "content": "..."}
|
||||||
|
text_parts.append(part["content"])
|
||||||
|
elif part_type:
|
||||||
|
# Image, file, or other binary attachment — skip and warn
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"[chatgpt] Skipping %s attachment in conversation %s "
|
"[chatgpt] Skipping %s attachment in conversation %s "
|
||||||
"— rich content not yet supported (see FUTURE.md)",
|
"— rich content not yet supported (see FUTURE.md)",
|
||||||
part_type,
|
part_type,
|
||||||
conv_id[:8],
|
conv_id[:8],
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
text_parts.append(part.get("text", ""))
|
|
||||||
|
|
||||||
return "\n".join(t for t in text_parts if t)
|
return "\n".join(t for t in text_parts if t)
|
||||||
|
|||||||
Reference in New Issue
Block a user