diff --git a/.env.example b/.env.example index 97b0a63..24756a6 100644 --- a/.env.example +++ b/.env.example @@ -10,6 +10,13 @@ # Token type: JWT (starts with "eyJ"). Typically valid for ~7 days. CHATGPT_SESSION_TOKEN= +# ChatGPT Projects (optional): comma-separated list of project gizmo IDs. +# Project conversations are NOT included in the default /conversations listing. +# How to find: open chatgpt.com → click a Project → look at the browser URL: +# https://chatgpt.com/g/g-p--/project → copy "g-p-" +# Example: CHATGPT_PROJECT_IDS=g-p-68c2b2b3037c8191890036fb4ae3ed9f,g-p-anotherproject +CHATGPT_PROJECT_IDS= + # --- Claude --- # How to get: open claude.ai in Chrome → F12 → Application tab # → Cookies → https://claude.ai → find "sessionKey" → copy Value @@ -26,6 +33,18 @@ EXPORT_DIR=./exports # provider/year → exports/claude/2024/file.md (ignores projects) OUTPUT_STRUCTURE=provider/project/year +# --- Joplin --- +# Automate importing exported conversations into Joplin as notes. +# Requires Joplin desktop running with the Web Clipper service enabled. +# How to get the token: +# Joplin → Tools → Options → Web Clipper → copy "Authorization token" +JOPLIN_API_TOKEN= +# API URL (default port is 41184; change only if you've customised it) +JOPLIN_API_URL=http://localhost:41184 +# Request timeout in seconds (default: 30). Increase if Joplin times out on +# large conversations. Example: JOPLIN_REQUEST_TIMEOUT=60 +# JOPLIN_REQUEST_TIMEOUT=30 + # --- Cache --- # Where the sync manifest and logs are stored (default: ~/.ai-chat-exporter) CACHE_DIR=~/.ai-chat-exporter diff --git a/.gitignore b/.gitignore index 064d8a2..f0e2e6f 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ logs/ *.swp *.swo Thumbs.db + +# HTTP traffic captures — may contain auth cookies and session tokens +*.har diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ae6ba7..bc76f7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ All notable changes to this project will be documented here. 
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [0.2.0] - Unreleased +### Added +- Joplin import automation: `joplin` command syncs exported Markdown files to Joplin as notes +- Notebooks created automatically per provider+project (`ChatGPT - My Project`, etc.) +- Re-running is safe: notes are updated, not duplicated (Joplin note ID stored in manifest) +- `JOPLIN_API_TOKEN`, `JOPLIN_API_URL`, `JOPLIN_REQUEST_TIMEOUT` config variables +- Configurable request timeout with clear error messages and actionable hints on timeout +- `--project` filter on `export` and `list` commands (case-insensitive substring or `none`) +- ChatGPT Projects support via `CHATGPT_PROJECT_IDS` env var + ## [0.1.0] - Unreleased ### Added - Initial implementation: ChatGPT and Claude export via internal web APIs diff --git a/FUTURE.md b/FUTURE.md index dbd0c29..5161bbf 100644 --- a/FUTURE.md +++ b/FUTURE.md @@ -1,9 +1,17 @@ # Planned Future Work -These items are explicitly out of scope for v0.1.0 but have been designed for. -The codebase is structured to make each of these additions straightforward. +Items completed in each release are moved to the changelog. Items here are +designed for but not yet implemented. The codebase is structured to make each +of these additions straightforward. + +**Completed:** +- v0.1.0 — Core export: ChatGPT + Claude, incremental sync, Markdown + JSON output +- v0.2.0 — Joplin import automation (`joplin` command, create/update notes, notebook auto-creation) + +--- + +## Export `--force` Flag (v0.2.x) -## Export --force Flag (v0.1.x) Add `--force` to the `export` command to re-export already-cached conversations without permanently clearing the entire manifest. Useful for re-generating files after changing the Markdown template or output structure. @@ -13,30 +21,27 @@ returns all conversations regardless of cache state when force is True. Current workaround: `python -m src.main cache --clear` then re-run export. 
-## Joplin Integration (v0.2.0) -Automate importing exported Markdown files into Joplin as new notes. -Joplin exposes a local REST API (requires Joplin desktop running with Web Clipper enabled). +## Joplin `--force` Flag (v0.2.x) -Approach: after export, iterate exported files and POST each to -`http://localhost:41184/notes` with the appropriate notebook ID. +Similarly, add `--force` to the `joplin` command to re-sync all cached +conversations to Joplin regardless of whether they've been synced before. +Useful after making formatting changes to the Markdown exporter. -The output folder structure maps directly to Joplin notebooks: -- exports/chatgpt/my-project/ → Joplin notebook "ChatGPT - My Project" -- exports/claude/my-project/ → Joplin notebook "Claude - My Project" -- exports/chatgpt/no-project/ → Joplin notebook "ChatGPT - No Project" -- exports/claude/no-project/ → Joplin notebook "Claude - No Project" +Implementation: in `get_joplin_pending()`, return all entries that have a +`file_path` when `force=True`, ignoring `joplin_synced_at`. -Prerequisites: -- Joplin desktop must be running with Web Clipper enabled -- `JOPLIN_API_TOKEN` env var (get from Joplin → Tools → Web Clipper Options) -- The Joplin import script will need to create notebooks if they don't exist, - then POST each note into the correct notebook +## Per-Conversation Cache Reset (v0.2.x) -Note: The default OUTPUT_STRUCTURE of provider/project/year is assumed when -implementing the import script. If the user has changed OUTPUT_STRUCTURE, -the import script will need updating accordingly. +Add `cache --reset --conversation ` to force re-export or re-sync of a +single conversation without clearing the entire provider cache. + +Current workaround: manually edit `~/.ai-chat-exporter/manifest.json` and +delete the entry, then re-run export. 
+ +--- + +## Official API Fallback (v0.3.0) -## Official API Migration (v0.3.0) If the unofficial internal web API approach breaks, migrate to official export file parsing as a fallback: - ChatGPT: parse `conversations.json` from Settings → Export Data @@ -44,14 +49,17 @@ file parsing as a fallback: The `BaseProvider` abstract class is intentionally designed so that a `FileProvider` subclass can implement the same interface -(list_conversations, get_conversation, normalize_conversation) +(`list_conversations`, `get_conversation`, `normalize_conversation`) without any changes to cache, exporters, or CLI code. To add this: implement `src/providers/file_chatgpt.py` and `src/providers/file_claude.py`, then add `--input-file` flag to the export command to accept a pre-downloaded export ZIP or JSON. +--- + ## Rich Content Support (v0.4.0) + Currently only text content is exported. Future versions should handle: ### Claude @@ -68,5 +76,88 @@ Currently only text content is exported. Future versions should handle: Implementation note: the normalized message schema already includes a `content_type` field placeholder. When this work begins, extend the schema -rather than replacing it. In v0.1.0, log a WARNING whenever non-text content -is encountered so users know what was skipped. +rather than replacing it. Non-text content already logs a WARNING when +encountered so users can see what was skipped. + +--- + +## Scheduled / Watch Mode (v0.5.0) + +Add a `watch` command (or cron integration helper) to run exports automatically +on a schedule: + +```bash +python -m src.main watch --interval 6h # poll every 6 hours +``` + +This would run `export` + `joplin` in sequence, then sleep. Alternatively, +provide a `cron` command that prints the correct crontab line for the user's +setup. + +Implementation: simple loop with `time.sleep()`, or emit a crontab entry +string that calls the export and joplin commands in sequence. 
A `--once` +flag would do a single run then exit (useful for cron itself). + +--- + +## Obsidian Vault Output (v0.5.0) + +Add an `obsidian` command (or `--target obsidian` flag) to sync exported +conversations into an Obsidian vault directory. The current Markdown format +is already largely compatible; the main differences are: + +- Obsidian uses YAML frontmatter `properties` (same format, already supported) +- Tags should use `#tag` inline or `tags:` list in frontmatter (already done) +- Wikilinks (`[[Title]]`) instead of Markdown links — optional, Obsidian + supports both + +Implementation: the existing `MarkdownExporter` output is already valid in +Obsidian. An `ObsidianSyncer` class (mirroring `JoplinClient`) would simply +copy files to the vault directory and maintain a flat or nested folder +structure matching the user's Obsidian setup. No API needed — just file I/O. + +--- + +## Joplin Nested Notebooks (future) + +Currently notebooks are flat: `ChatGPT - My Project`. Joplin supports nested +notebooks via `parent_id`. A future option (`JOPLIN_NESTED_NOTEBOOKS=true`) +could create a two-level hierarchy: + +``` +ChatGPT/ + My Project/ + No Project/ +Claude/ + Budget Tracker/ +``` + +Implementation: `get_or_create_notebook` would first find/create the provider +notebook, then find/create the project notebook as a child. + +--- + +## Token Expiry Notifications (future) + +Proactively warn when a token is close to expiry (within 48h for ChatGPT), +rather than only surfacing the warning at startup. 
Options: + +- Add an `expiry` subcommand that prints token status and exits non-zero if + any token is expired or expiring soon (useful in scripts/cron) +- Send a desktop notification via `notify-send` (Linux) or `osascript` (macOS) + when a token is within 24h of expiry + +--- + +## Search Command (future) + +Add a `search` command to full-text search across all exported Markdown files: + +```bash +python -m src.main search "kubernetes ingress" +python -m src.main search "kubernetes ingress" --provider claude --project devops +``` + +Implementation: `grep`/`ripgrep` over `EXPORT_DIR`, display results with +conversation title, date, and a snippet. No index needed — Markdown files are +small enough to grep directly. diff --git a/README.md b/README.md index 897fade..2b68716 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # AI Chat Exporter -A personal backup tool for ChatGPT and Claude conversation history. Exports your chats to Markdown files structured for archival in [Joplin](https://joplinapp.org/). Each conversation becomes a single `.md` file with YAML frontmatter, organised into folders that map directly to Joplin notebooks. +A personal backup tool for ChatGPT and Claude conversation history. Exports your chats to Markdown files and syncs them to [Joplin](https://joplinapp.org/) as notes. Each conversation becomes a single `.md` file with YAML frontmatter, organised into folders that map directly to Joplin notebooks. Supports incremental sync — only new or updated conversations are exported on each run. Every run is resumable: if interrupted, re-running picks up exactly where it left off. 
@@ -101,20 +101,62 @@ Copy `.env.example` to `.env` and fill in your values: cp .env.example .env ``` +### Provider tokens + +| Variable | Description | +|----------|-------------| +| `CHATGPT_SESSION_TOKEN` | Your ChatGPT JWT session token (`eyJ…`) | +| `CHATGPT_PROJECT_IDS` | Comma-separated ChatGPT project IDs (see below) | +| `CLAUDE_SESSION_KEY` | Your Claude session key | + +### Output + | Variable | Default | Description | |----------|---------|-------------| -| `CHATGPT_SESSION_TOKEN` | — | Your ChatGPT JWT session token | -| `CLAUDE_SESSION_KEY` | — | Your Claude session key | -| `EXPORT_DIR` | `./exports` | Where to write exported files | +| `EXPORT_DIR` | `./exports` | Where to write exported Markdown files | | `OUTPUT_STRUCTURE` | `provider/project/year` | Folder structure (see below) | + +### Joplin + +| Variable | Default | Description | +|----------|---------|-------------| +| `JOPLIN_API_TOKEN` | — | Authorization token from Joplin Web Clipper settings | +| `JOPLIN_API_URL` | `http://localhost:41184` | Joplin API URL (change only if you've customised the port) | +| `JOPLIN_REQUEST_TIMEOUT` | `30` | Seconds before an API call times out. Increase for very large conversations. | + +### Cache & logging + +| Variable | Default | Description | +|----------|---------|-------------| | `CACHE_DIR` | `~/.ai-chat-exporter` | Where to store the sync manifest | | `LOG_FILE` | `~/.ai-chat-exporter/logs/exporter.log` | Log file path (`none` to disable) | --- +## ChatGPT Projects + +ChatGPT project conversations are stored separately from your main conversation list and require extra configuration. + +### Finding your project IDs + +1. Open ChatGPT and click a Project in the left sidebar +2. Look at the browser URL — it will look like: + `https://chatgpt.com/g/g-p-68c2b2b3037c8191890036fb4ae3ed9f-my-project/project` +3. 
Copy the `g-p-…` part (everything up to but not including the slug after the second `-`) + +Add all your project IDs to `.env` as a comma-separated list: + +``` +CHATGPT_PROJECT_IDS=g-p-68c2b2b3037c8191890036fb4ae3ed9f,g-p-anotherprojectid +``` + +The `auth` wizard can also guide you through this step interactively. + +--- + ## Output Structure -All exported files go under `EXPORT_DIR`. The structure maps to Joplin notebooks. +All exported files go under `EXPORT_DIR`. The folder structure maps directly to Joplin notebooks. ### Default: `provider/project/year` @@ -136,7 +178,9 @@ exports/ └── 2024-06-10_manifest-setup_jkl22222.md ``` -### Joplin Notebook Mapping (for future automated import) +### Joplin Notebook Mapping + +Each provider+project combination maps to a flat Joplin notebook created automatically by the `joplin` command: | Export folder | Joplin notebook | |---------------|-----------------| @@ -177,7 +221,7 @@ exports/ python -m src.main auth ``` -Guided wizard to find and save session tokens. Detects OS and shows the correct DevTools shortcut. +Guided wizard to find and save session tokens and ChatGPT project IDs. Detects OS and shows the correct DevTools shortcut. 
### `doctor` — Health check @@ -205,6 +249,12 @@ python -m src.main export --format both # Only conversations updated since a date python -m src.main export --since 2024-06-01 +# Only conversations in a specific project (case-insensitive substring) +python -m src.main export --project "learning python" + +# Only conversations outside any project +python -m src.main export --project none + # Write to a custom directory python -m src.main export --output /path/to/my/notes @@ -212,15 +262,54 @@ python -m src.main export --output /path/to/my/notes python -m src.main export --dry-run ``` -Options: `--provider [chatgpt|claude|all]`, `--format [markdown|json|both]`, `--output PATH`, `--since YYYY-MM-DD`, `--dry-run` +Options: `--provider [chatgpt|claude|all]`, `--format [markdown|json|both]`, `--output PATH`, `--since YYYY-MM-DD`, `--project NAME`, `--dry-run` ### `list` — List conversations ```bash +# List all conversations for all providers +python -m src.main list + +# Single provider python -m src.main list --provider chatgpt + +# Filter by project +python -m src.main list --project "learning python" + +# Only conversations outside any project +python -m src.main list --project none ``` -Fetches and displays all conversations without exporting them. +Fetches and displays all conversations without exporting them. Useful for verifying what the tool can see before running an export. + +### `joplin` — Sync to Joplin + +```bash +# Sync all pending conversations to Joplin +python -m src.main joplin + +# Preview what would be synced without sending anything +python -m src.main joplin --dry-run + +# Sync a single provider +python -m src.main joplin --provider chatgpt + +# Sync only conversations in a specific project +python -m src.main joplin --project "learning python" + +# Sync only conversations outside any project +python -m src.main joplin --project none +``` + +Reads the local export cache and pushes each exported Markdown file to Joplin as a note. 
Notebooks are created automatically. Re-running is safe — notes are updated (not duplicated). + +**Prerequisites:** +1. Run `export` first to generate the Markdown files +2. Open Joplin → Tools → Options → Web Clipper → enable the service +3. Copy the Authorization token and add `JOPLIN_API_TOKEN=` to your `.env` +4. Joplin desktop must be open when you run this command + +Options: `--provider [chatgpt|claude|all]`, `--project NAME`, `--dry-run` ### `cache` — Manage the sync manifest @@ -239,15 +328,20 @@ python -m src.main cache --clear --provider claude ## How the Cache Works -The cache manifest lives at `~/.ai-chat-exporter/manifest.json` and records every exported conversation: its title, project, `updated_at` timestamp, and output file path. +The cache manifest lives at `~/.ai-chat-exporter/manifest.json` and records every exported conversation: its title, project, `updated_at` timestamp, output file path, and (after Joplin sync) the Joplin note ID. -On every run: +On every `export` run: 1. Fetch the full conversation list from the provider 2. Compare each conversation's `updated_at` against the manifest 3. Export only conversations that are new or have been updated 4. Write each successfully exported conversation to the manifest **immediately** (not batched) -**This design makes every run inherently resumable.** If the tool is interrupted for any reason — rate limit, network drop, Ctrl+C, crash — simply re-run the same command. It will skip already-exported conversations and continue from where it stopped. +On every `joplin` run: +1. Read the manifest to find conversations not yet synced to Joplin, or re-exported since last sync +2. Push each pending Markdown file to Joplin (create or update) +3. Store the Joplin note ID in the manifest so subsequent runs update rather than duplicate + +**This design makes every run inherently resumable.** If the tool is interrupted for any reason — rate limit, network drop, Ctrl+C, crash — simply re-run the same command. 
It will skip already-processed conversations and continue from where it stopped. To force a full re-export: `python -m src.main cache --clear` then re-run export. @@ -265,11 +359,36 @@ Note: Claude's `sessionKey` is an opaque string — the only way to know it's ex ### `429 Rate Limited` The tool automatically pauses, saves progress, and exits with a clear message showing how many conversations were exported vs remaining. Just re-run the same export command to resume — the cache picks up exactly where it left off. +### Joplin: "JOPLIN_API_TOKEN is not set" +You need to configure the token before running the `joplin` command: +1. Open Joplin desktop +2. Go to Tools → Options → Web Clipper +3. Enable the Web Clipper service +4. Copy the Authorization token shown on that page +5. Add `JOPLIN_API_TOKEN=` to your `.env` file + +### Joplin: "Joplin is not responding" +Joplin desktop must be running when you run the `joplin` command. The Web Clipper service shuts down when Joplin is closed. + +### Joplin: "Joplin rejected the API token (HTTP 401)" +The token in `JOPLIN_API_TOKEN` doesn't match what Joplin expects. Get a fresh token from Joplin → Tools → Options → Web Clipper → Authorization token. + +### Joplin: note timed out +If you see a timeout error, Joplin took longer than `JOPLIN_REQUEST_TIMEOUT` seconds (default: 30) to respond. Possible causes: +- The conversation is very large and Joplin is slow to index it +- Joplin is busy syncing or loading a large library +- Joplin has frozen — try restarting it + +To increase the timeout: add `JOPLIN_REQUEST_TIMEOUT=60` to your `.env`. + +### ChatGPT project conversations not appearing +Make sure you've added the project IDs to `CHATGPT_PROJECT_IDS` in your `.env`. See [ChatGPT Projects](#chatgpt-projects) for how to find them. Project conversations are not included in the default conversation listing — they must be fetched separately. 
+ ### Schema warnings in logs (`Unexpected API response shape`) The provider's internal API may have changed. Run with `--debug`, sanitize the output (remove any personal content), and check the project's GitHub Issues for known fixes. ### Non-text content warnings -Images, code interpreter outputs, DALL-E generations, and Claude artifacts are not exported in v0.1.0. A WARNING is logged for each skipped item. See `FUTURE.md` for the v0.4.0 roadmap. +Images, code interpreter outputs, DALL-E generations, and Claude artifacts are not exported in v0.2.0. A WARNING is logged for each skipped item. See `FUTURE.md` for the roadmap. ### Empty export / all conversations skipped No new or updated conversations since your last run. To verify: `python -m src.main cache --show`. To force a full re-export: `python -m src.main cache --clear`. @@ -285,17 +404,18 @@ No new or updated conversations since your last run. To verify: `python -m src.m See `FUTURE.md` for planned features: -- **v0.1.x** — `export --force` flag to bypass cache for a single run -- **v0.2.0** — Joplin integration: auto-import exported files via Joplin's local REST API +- **v0.2.x** — `export --force` flag; `joplin --force` flag; per-conversation cache reset - **v0.3.0** — Official API fallback: parse export ZIP files from ChatGPT/Claude settings - **v0.4.0** — Rich content: images, artifacts, code interpreter output, extended thinking +- **v0.5.0** — Watch/scheduled mode; Obsidian vault output --- ## Security Notes -- All exported data is stored **locally only** — nothing is sent anywhere +- All exported data is stored **locally only** — nothing is sent anywhere except to your local Joplin instance - Exported files and the cache manifest are created with `600` permissions (owner read/write only) - `.env` is in `.gitignore` — **never commit it** - Session tokens are never logged, printed, or included in error messages +- The Joplin API token is only ever sent to `localhost` — it never leaves your machine - If 
you accidentally commit `.env`: immediately log out and back in to invalidate the token, then remove it from git history using [BFG Repo Cleaner](https://rtyley.github.io/bfg-repo-cleaner/) or `git filter-branch` diff --git a/debug_auth.py b/debug_auth.py deleted file mode 100644 index 1a8374c..0000000 --- a/debug_auth.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Debug script — checks what /api/auth/session returns using curl_cffi Chrome impersonation.""" -import os -from dotenv import load_dotenv -from curl_cffi import requests as curl_requests - -load_dotenv() -token = os.getenv("CHATGPT_SESSION_TOKEN") -if not token: - print("ERROR: CHATGPT_SESSION_TOKEN not found in .env") - raise SystemExit(1) - -s = curl_requests.Session(impersonate="chrome120") -s.cookies.set("__Secure-next-auth.session-token", token, domain="chatgpt.com", path="/") -s.headers.update({ - "Referer": "https://chatgpt.com/", - "Accept": "*/*", - "sec-fetch-dest": "empty", - "sec-fetch-mode": "cors", - "sec-fetch-site": "same-origin", -}) - -print("Calling /api/auth/session (with Chrome TLS impersonation) ...") -r = s.get("https://chatgpt.com/api/auth/session", timeout=15) -print(f"Status: {r.status_code}") -print(f"Content-Type: {r.headers.get('content-type', '(none)')}") - -try: - data = r.json() - print(f"Top-level keys: {list(data.keys())}") - access_token = data.get("accessToken") - if access_token: - print(f"accessToken: FOUND (length={len(access_token)}, starts with '{access_token[:10]}...')") - else: - print("accessToken: NOT FOUND in response") - print(f"Full response body:\n{r.text}") -except Exception as e: - print(f"Could not parse JSON: {e}\nRaw body:\n{r.text[:500]}") diff --git a/debug_claude.py b/debug_claude.py deleted file mode 100644 index ed653b2..0000000 --- a/debug_claude.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Debug script — tests Claude API connectivity using curl_cffi Chrome impersonation.""" -import os -from dotenv import load_dotenv -from curl_cffi import requests as 
curl_requests - -load_dotenv() -key = os.getenv("CLAUDE_SESSION_KEY") -if not key: - print("ERROR: CLAUDE_SESSION_KEY not found in .env") - raise SystemExit(1) - -s = curl_requests.Session(impersonate="chrome120") -s.cookies.set("sessionKey", key, domain="claude.ai", path="/") -s.headers.update({ - "Referer": "https://claude.ai/", - "Accept": "application/json", -}) - -print("Calling /api/organizations (with Chrome TLS impersonation) ...") -r = s.get("https://claude.ai/api/organizations", timeout=15) -print(f"Status: {r.status_code}") -print(f"Response (first 400 chars): {r.text[:400]}") diff --git a/pyproject.toml b/pyproject.toml index fe20338..f0ca2a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ai-chat-exporter" -version = "0.1.0" +version = "0.2.0" description = "Export ChatGPT and Claude conversation history to Markdown for personal archival in Joplin" requires-python = ">=3.11" dependencies = [ diff --git a/src/cache.py b/src/cache.py index 0036240..315ecdb 100644 --- a/src/cache.py +++ b/src/cache.py @@ -1,4 +1,4 @@ -"""Local cache manifest for tracking exported conversations.""" +"""Local cache manifest for tracking exported and Joplin-synced conversations.""" import json import logging @@ -18,11 +18,17 @@ class CacheError(Exception): class Cache: - """Manages the local JSON manifest of exported conversations. + """Manages the local JSON manifest of exported and Joplin-synced conversations. - The manifest is the single source of truth for what has been exported. - Every run compares the provider's full conversation list against this - manifest to determine what is new or updated. + The manifest is the single source of truth for what has been exported and + synced. Every export run compares the provider's full conversation list + against this manifest to determine what is new or updated. 
The Joplin sync + run reads it to find conversations not yet pushed to Joplin (or re-exported + since the last sync). + + Each entry tracks: + title, project, updated_at, exported_at, file_path, + joplin_note_id (after first sync), joplin_synced_at (after first sync) File security: - Permissions: 600 (owner read/write only) @@ -150,6 +156,59 @@ class Cache: """Return all cached entries for a provider (for --cache --show).""" return dict(self._data.get(provider, {})) + def mark_joplin_synced(self, provider: str, conv_id: str, note_id: str) -> None: + """Record a successful Joplin sync for a conversation. + + Adds ``joplin_note_id`` and ``joplin_synced_at`` to the manifest entry + and writes atomically to disk. + """ + entry = self._data.get(provider, {}).get(conv_id) + if entry is None: + logger.warning( + "[cache] mark_joplin_synced: no cache entry for %s/%s", provider, conv_id[:8] + ) + return + entry["joplin_note_id"] = note_id + entry["joplin_synced_at"] = datetime.now(tz=timezone.utc).isoformat() + self._save() + + def get_joplin_pending(self, provider: str) -> list[tuple[str, dict]]: + """Return (conv_id, entry) pairs that need to be synced to Joplin. + + A conversation is pending when: + - It has never been synced (no ``joplin_note_id``), OR + - It was re-exported after the last Joplin sync + (``exported_at`` > ``joplin_synced_at``). + + Returns: + List of (conv_id, entry_dict) tuples, where entry_dict includes + ``file_path``, ``title``, ``project``, and optionally ``joplin_note_id``. 
+ """ + pending = [] + for conv_id, entry in self._data.get(provider, {}).items(): + if not isinstance(entry, dict): + continue + if not entry.get("file_path"): + continue + + note_id = entry.get("joplin_note_id") + if not note_id: + pending.append((conv_id, entry)) + continue + + # Re-sync if the file was re-exported after the last Joplin sync + exported_at = entry.get("exported_at", "") + synced_at = entry.get("joplin_synced_at", "") + if exported_at and synced_at: + try: + from src.utils import _parse_dt + if _parse_dt(exported_at) > _parse_dt(synced_at): + pending.append((conv_id, entry)) + except Exception: + pass + + return pending + def last_run(self) -> str | None: """Return the ISO8601 timestamp of the last export run, or None.""" return self._data.get("last_run") diff --git a/src/config.py b/src/config.py index 7085523..daffcb0 100644 --- a/src/config.py +++ b/src/config.py @@ -35,6 +35,13 @@ class Config: log_file: str # Decoded ChatGPT JWT expiry (None if token absent or not a JWT) chatgpt_token_expiry: datetime | None = field(default=None, repr=False) + # ChatGPT Project gizmo IDs (g-p-xxx) — project conversations are not + # included in the default /conversations listing; they must be fetched + # separately via /backend-api/gizmos/{id}/conversations. 
+ chatgpt_project_ids: list[str] = field(default_factory=list) + # Joplin local REST API settings (Web Clipper service) + joplin_api_token: str | None = None + joplin_api_url: str = "http://localhost:41184" def load_config() -> Config: @@ -54,6 +61,24 @@ def load_config() -> Config: cache_dir = Path(os.getenv("CACHE_DIR", "~/.ai-chat-exporter")).expanduser() log_file = os.getenv("LOG_FILE", "~/.ai-chat-exporter/logs/exporter.log").strip() + # Joplin + joplin_token = os.getenv("JOPLIN_API_TOKEN", "").strip() or None + joplin_url = os.getenv("JOPLIN_API_URL", "http://localhost:41184").strip() + + # Parse CHATGPT_PROJECT_IDS — comma-separated list of gizmo IDs (g-p-xxx) + _project_ids_raw = os.getenv("CHATGPT_PROJECT_IDS", "").strip() + chatgpt_project_ids = [ + pid.strip() + for pid in _project_ids_raw.split(",") + if pid.strip() and pid.strip().startswith("g-p-") + ] if _project_ids_raw else [] + if _project_ids_raw and not chatgpt_project_ids: + logger.warning( + "CHATGPT_PROJECT_IDS is set but contains no valid project IDs. " + "Each ID should start with 'g-p-' (e.g. g-p-68c2b2b3037c8191890036fb4ae3ed9f). " + "Find your project ID in the browser URL when viewing a project." 
"""Joplin Data API client for importing notes into Joplin desktop."""

import logging
import os
from typing import Any

import requests

logger = logging.getLogger(__name__)

# HTTP timeout for regular API calls (seconds). Notes can be large Markdown
# files so we allow more time than a typical JSON API call.
# Override with JOPLIN_REQUEST_TIMEOUT env var if you have very large conversations.
_REQUEST_TIMEOUT: int = int(os.getenv("JOPLIN_REQUEST_TIMEOUT", "30"))


class JoplinError(Exception):
    """Raised when the Joplin API returns an error or is unreachable."""


class JoplinClient:
    """HTTP client for the Joplin local REST API (Web Clipper service).

    Requires Joplin desktop to be running with the Web Clipper service enabled.
    Get your API token from: Joplin → Tools → Options → Web Clipper.

    Args:
        base_url: Joplin API base URL (default: http://localhost:41184).
        token: API authorization token from Joplin Web Clipper settings.
    """

    def __init__(self, base_url: str, token: str) -> None:
        self._base_url = base_url.rstrip("/")
        self._token = token
        # In-memory cache of notebook title → ID to avoid repeated GET /folders
        self._notebook_cache: dict[str, str] = {}
        self._notebooks_loaded = False
        logger.debug("[joplin] Client initialised with base_url=%s", self._base_url)

    # ------------------------------------------------------------------
    # Connectivity
    # ------------------------------------------------------------------

    def ping(self) -> bool:
        """Return True if the Joplin API is reachable and responding.

        Note: /ping does not require authentication. A successful ping only
        confirms Joplin is running — not that the token is valid. Call
        ``validate_token()`` to confirm authentication separately.

        Raises:
            JoplinError: If the request fails for a reason other than a
                connection error or timeout (both of which return False).
        """
        url = f"{self._base_url}/ping"
        logger.debug("[joplin] GET %s", url)
        try:
            resp = requests.get(url, timeout=5)
            resp.raise_for_status()
            # Joplin identifies itself in the ping body; anything else means
            # some other service is listening on this port.
            ok = "JoplinClipperServer" in resp.text
            logger.debug("[joplin] ping → %s (body: %r)", "OK" if ok else "unexpected response", resp.text[:80])
            return ok
        except requests.exceptions.ConnectionError:
            logger.debug("[joplin] ping → connection refused at %s", url)
            return False
        except requests.exceptions.Timeout:
            logger.debug("[joplin] ping → timed out after 5s at %s", url)
            return False
        except requests.exceptions.RequestException as e:
            raise JoplinError(f"Joplin ping failed: {e}") from e

    def validate_token(self) -> None:
        """Verify the API token is accepted by Joplin.

        Does a minimal authenticated call (GET /folders?limit=1) and raises
        ``JoplinError`` if authentication fails.

        Raises:
            JoplinError: If the token is rejected (401) or Joplin is unreachable.
        """
        logger.debug("[joplin] Validating API token…")
        self._get("/folders", params={"limit": 1, "fields": "id"})
        logger.debug("[joplin] Token validated OK")

    # ------------------------------------------------------------------
    # Notebooks (folders)
    # ------------------------------------------------------------------

    def list_notebooks(self) -> list[dict]:
        """Return all Joplin notebooks (folders), handling pagination.

        Returns:
            List of folder dicts with at least ``id`` and ``title`` keys.
        """
        results: list[dict] = []
        page = 1
        while True:
            logger.debug("[joplin] GET /folders page=%d", page)
            resp = self._get("/folders", params={"page": page, "fields": "id,title"})
            items = resp.get("items", [])
            results.extend(items)
            logger.debug("[joplin] /folders page=%d → %d items, has_more=%s", page, len(items), resp.get("has_more"))
            if not resp.get("has_more"):
                break
            page += 1
        return results

    def get_or_create_notebook(self, title: str) -> str:
        """Return the Joplin folder ID for ``title``, creating it if needed.

        Args:
            title: Notebook display name (e.g. "ChatGPT - My Project").

        Returns:
            Joplin folder ID string.
        """
        if not self._notebooks_loaded:
            self._load_notebook_cache()

        if title in self._notebook_cache:
            folder_id = self._notebook_cache[title]
            logger.debug("[joplin] Notebook cache hit: %r → %s", title, folder_id)
            return folder_id

        # Not found — create it
        logger.info("[joplin] Creating notebook: %r", title)
        resp = self._post("/folders", {"title": title})
        folder_id = resp["id"]
        self._notebook_cache[title] = folder_id
        logger.debug("[joplin] Notebook created: %r → %s", title, folder_id)
        return folder_id

    # ------------------------------------------------------------------
    # Notes
    # ------------------------------------------------------------------

    def create_note(self, title: str, body: str, parent_id: str) -> str:
        """Create a new note in the specified notebook.

        Args:
            title: Note title.
            body: Note body (Markdown).
            parent_id: Notebook (folder) ID.

        Returns:
            ID of the created note.
        """
        logger.debug(
            "[joplin] Creating note: %r in notebook %s (%d chars)",
            title, parent_id, len(body),
        )
        resp = self._post("/notes", {"title": title, "body": body, "parent_id": parent_id})
        note_id = resp["id"]
        logger.info("[joplin] Note created: %r → %s", title, note_id)
        return note_id

    def update_note(self, note_id: str, title: str, body: str) -> None:
        """Update the title and body of an existing note.

        Args:
            note_id: Joplin note ID.
            title: New note title.
            body: New note body (Markdown).
        """
        logger.debug(
            "[joplin] Updating note %s: %r (%d chars)",
            note_id, title, len(body),
        )
        self._put(f"/notes/{note_id}", {"title": title, "body": body})
        logger.info("[joplin] Note updated: %r (%s)", title, note_id)

    # ------------------------------------------------------------------
    # HTTP helpers
    # ------------------------------------------------------------------
    # _get/_post/_put previously each repeated the same five-branch
    # exception-translation block; they are now thin wrappers over _request
    # so the error mapping lives in exactly one place.

    def _get(self, path: str, params: dict | None = None) -> dict[str, Any]:
        """GET ``path`` and return the decoded JSON body."""
        return self._request("GET", path, params=params)

    def _post(self, path: str, data: dict) -> dict[str, Any]:
        """POST ``data`` as JSON to ``path`` and return the decoded JSON body."""
        return self._request("POST", path, data=data)

    def _put(self, path: str, data: dict) -> dict[str, Any]:
        """PUT ``data`` as JSON to ``path`` and return the decoded JSON body."""
        return self._request("PUT", path, data=data)

    def _request(
        self,
        method: str,
        path: str,
        *,
        params: dict | None = None,
        data: dict | None = None,
    ) -> dict[str, Any]:
        """Send one authenticated request and translate transport errors.

        The API token is always passed as a query parameter (Joplin's auth
        scheme). ``data``, when given, is sent as a JSON body.

        Args:
            method: HTTP verb ("GET", "POST", "PUT").
            path: API path starting with "/" (e.g. "/notes").
            params: Extra query parameters, merged with the token.
            data: Optional JSON body.

        Returns:
            Decoded JSON response body.

        Raises:
            JoplinError: On connection failure, timeout, HTTP error status,
                or any other request failure — always with an actionable
                message.
        """
        url = f"{self._base_url}{path}"
        query: dict[str, Any] = {"token": self._token, **(params or {})}
        logger.debug("[joplin] %s %s params=%s", method, path, {k: v for k, v in (params or {}).items()})
        try:
            resp = requests.request(method, url, params=query, json=data, timeout=_REQUEST_TIMEOUT)
            logger.debug("[joplin] %s %s → HTTP %d", method, path, resp.status_code)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.ConnectionError as e:
            raise JoplinError(
                "Cannot connect to Joplin. Is Joplin desktop running with Web Clipper enabled?"
            ) from e
        except requests.exceptions.Timeout as e:
            raise JoplinError(_timeout_message(method, path)) from e
        except requests.exceptions.HTTPError as e:
            raise JoplinError(_http_error_message(method, path, e)) from e
        except requests.exceptions.RequestException as e:
            raise JoplinError(f"Joplin {method} {path} failed: {e}") from e

    def _load_notebook_cache(self) -> None:
        # One-shot fill of the title → ID map; later titles with duplicate
        # names overwrite earlier ones (Joplin allows duplicate folder titles).
        logger.debug("[joplin] Loading notebook list from Joplin…")
        notebooks = self.list_notebooks()
        self._notebook_cache = {nb["title"]: nb["id"] for nb in notebooks}
        self._notebooks_loaded = True
        logger.debug("[joplin] Notebook cache loaded: %d notebooks", len(self._notebook_cache))
        for title, folder_id in self._notebook_cache.items():
            logger.debug("[joplin]   %r → %s", title, folder_id)


# ------------------------------------------------------------------
# Error message helpers
# ------------------------------------------------------------------


def _timeout_message(method: str, path: str) -> str:
    """Build a clear timeout error message with actionable suggestions."""
    return (
        f"Joplin {method} {path} timed out after {_REQUEST_TIMEOUT}s. "
        "Possible causes:\n"
        "  • The note body is very large and Joplin is slow to process it.\n"
        "  • Joplin is busy (syncing, indexing, or loading a large library).\n"
        "  • Joplin has frozen — try restarting it.\n"
        f"If this happens repeatedly, increase JOPLIN_REQUEST_TIMEOUT in your .env "
        f"(currently {_REQUEST_TIMEOUT}s)."
    )


def _http_error_message(method: str, path: str, e: requests.exceptions.HTTPError) -> str:
    """Build a human-friendly error message from an HTTP error, with auth hint on 401."""
    resp = e.response
    status = resp.status_code if resp is not None else "?"
    if status == 401:
        return (
            f"Joplin rejected the API token (HTTP 401 on {method} {path}). "
            "Check that JOPLIN_API_TOKEN is correct: "
            "Joplin → Tools → Options → Web Clipper → Authorization token."
        )
    if status == 404:
        return f"Joplin resource not found (HTTP 404 on {method} {path}). The note may have been deleted in Joplin."
    body_snippet = ""
    if resp is not None:
        try:
            body_snippet = f" — {resp.text[:120]}"
        except Exception:
            pass
    return f"Joplin {method} {path} failed: HTTP {status}{body_snippet}"


# ------------------------------------------------------------------
# Notebook naming helper
# ------------------------------------------------------------------


_PROVIDER_DISPLAY = {
    "chatgpt": "ChatGPT",
    "claude": "Claude",
}


def notebook_title(provider: str, project: str | None) -> str:
    """Derive a flat Joplin notebook title from provider and project name.

    Examples:
        notebook_title("chatgpt", "no-project")     → "ChatGPT - No Project"
        notebook_title("claude", "budget-tracker")  → "Claude - Budget Tracker"
        notebook_title("chatgpt", None)             → "ChatGPT - No Project"
    """
    prov_display = _PROVIDER_DISPLAY.get(provider, provider.capitalize())
    proj = (project or "no-project").replace("-", " ").title()
    return f"{prov_display} - {proj}"
@click.option("--debug", is_flag=True, help="DEBUG + full tracebacks + redacted API bodies.") @@ -175,6 +180,39 @@ def _auth_chatgpt(os_name: str) -> None: _write_token_to_env("CHATGPT_SESSION_TOKEN", token) + # --- ChatGPT Projects --- + console.print("\n[bold]ChatGPT Projects (optional)[/bold]") + console.print( + "Project conversations are stored separately and are not included in the\n" + "default conversation listing. To export them, you need each project's ID.\n" + ) + console.print("How to find a project ID:") + console.print(" 1. Open ChatGPT and click into a Project in the left sidebar.") + console.print(" 2. Look at the browser URL — it will look like:") + console.print(" [dim]https://chatgpt.com/g/[bold]g-p-68c2b2b3037c8191890036fb4ae3ed9f[/bold]-my-project/project[/dim]") + console.print(" 3. Copy the part starting with [bold]g-p-[/bold] up to (but not including) the slug.") + console.print(" Enter multiple IDs separated by commas. Leave blank to skip.\n") + + project_ids_raw = click.prompt( + "ChatGPT project IDs (comma-separated, e.g. 
g-p-xxx,g-p-yyy)", + default="", + show_default=False, + ).strip() + + if project_ids_raw: + ids = [pid.strip() for pid in project_ids_raw.split(",") if pid.strip()] + valid = [pid for pid in ids if pid.startswith("g-p-")] + invalid = [pid for pid in ids if not pid.startswith("g-p-")] + if invalid: + console.print(f"[yellow]Warning: skipping IDs that don't start with 'g-p-': {invalid}[/yellow]") + if valid: + _write_token_to_env("CHATGPT_PROJECT_IDS", ",".join(valid)) + console.print(f"[green]Saved {len(valid)} project ID(s).[/green]") + else: + console.print("[yellow]No valid project IDs — skipping.[/yellow]") + else: + console.print("[dim]Skipped project IDs.[/dim]") + def _auth_claude(os_name: str) -> None: console.print("\n[bold]─── Claude ───[/bold]") @@ -395,6 +433,15 @@ def _print_doctor_table(checks: list[dict]) -> None: default=None, help="Only export conversations updated after this date (YYYY-MM-DD).", ) +@click.option( + "--project", + "project_filter", + default=None, + help=( + "Only export conversations in a matching project (case-insensitive substring). " + "Use 'none' for conversations outside any project." + ), +) @click.option("--dry-run", is_flag=True, help="Show what would be exported without writing anything.") @click.pass_context def export( @@ -403,6 +450,7 @@ def export( fmt: str, output_dir: str | None, since: str | None, + project_filter: str | None, dry_run: bool, ) -> None: """Export new and updated conversations to Markdown or JSON. 
@@ -474,6 +522,12 @@ def export( summary[prov_name]["failed"] += len(all_convs) if "all_convs" in dir() else 0 continue + if project_filter is not None: + all_convs = _filter_by_project(all_convs, project_filter) + console.print( + f" [dim]--project filter '{project_filter}': {len(all_convs)} matching conversations.[/dim]" + ) + to_export = cache.get_new_or_updated(prov_name, all_convs) skipped = len(all_convs) - len(to_export) summary[prov_name]["skipped"] = skipped @@ -522,13 +576,11 @@ def export( progress.advance(task) except ProviderError as e: - logger = logging.getLogger(__name__) logger.error("Failed to export conversation %s: %s", conv_id[:8], e) summary[prov_name]["failed"] += 1 progress.advance(task) continue except OSError as e: - logger = logging.getLogger(__name__) logger.error("File write failed for conversation %s: %s", conv_id[:8], e) summary[prov_name]["failed"] += 1 progress.advance(task) @@ -560,7 +612,21 @@ def _resolve_providers(provider: str, cfg) -> list[tuple[str, object]]: from src.providers.claude import ClaudeProvider if provider in ("chatgpt", "all"): - try_add("chatgpt", cfg.chatgpt_session_token, ChatGPTProvider) + if cfg.chatgpt_session_token: + try: + result.append(( + "chatgpt", + ChatGPTProvider( + session_token=cfg.chatgpt_session_token, + project_ids=cfg.chatgpt_project_ids, + ), + )) + except ProviderError as e: + logging.getLogger(__name__).warning( + "[chatgpt] Could not initialise provider: %s", e + ) + elif provider == "chatgpt" or provider == "all": + logging.getLogger(__name__).warning("[chatgpt] Skipping — token not configured.") if provider in ("claude", "all"): try_add("claude", cfg.claude_session_key, ClaudeProvider) @@ -596,6 +662,44 @@ def _print_dry_run_table(prov_name, to_export, prov_instance, export_base, struc console.print(f" [dim]{skipped} conversations already cached (would be skipped).[/dim]") +def _raw_project_name(conv: dict) -> str | None: + """Extract the project name from a raw conversation summary 
dict. + + Handles both ChatGPT (annotated _project_name) and Claude (project dict). + """ + # ChatGPT: annotated during fetch_all_conversations + if "_project_name" in conv: + return conv["_project_name"] or None + # Claude: project is a dict with a 'name' key, or a plain string + project = conv.get("project") + if isinstance(project, dict): + return project.get("name") or None + if isinstance(project, str): + return project or None + return None + + +def _filter_by_project(convs: list[dict], project_filter: str) -> list[dict]: + """Filter conversations by project name. + + project_filter='none' → keep only conversations with no project. + Otherwise → case-insensitive substring match on the project name. + """ + want_none = project_filter.lower() == "none" + needle = project_filter.lower() + + result = [] + for conv in convs: + name = _raw_project_name(conv) + if want_none: + if name is None: + result.append(conv) + else: + if name and needle in name.lower(): + result.append(conv) + return result + + def _print_export_summary(summary: dict[str, dict[str, int]]) -> None: table = Table(title="Export Summary") table.add_column("Provider", style="bold") @@ -626,8 +730,17 @@ def _print_export_summary(summary: dict[str, dict[str, int]]) -> None: default="all", show_default=True, ) +@click.option( + "--project", + "project_filter", + default=None, + help=( + "Only list conversations in a matching project (case-insensitive substring). " + "Use 'none' for conversations outside any project." 
# ──────────────────────────────────────────────────────────────────────────────
# joplin command
# ──────────────────────────────────────────────────────────────────────────────


@cli.command()
@click.option(
    "--provider",
    type=click.Choice(["chatgpt", "claude", "all"], case_sensitive=False),
    default="all",
    show_default=True,
    help="Which provider's conversations to sync to Joplin.",
)
@click.option(
    "--project",
    "project_filter",
    default=None,
    help=(
        "Only sync conversations in a matching project (case-insensitive substring). "
        "Use 'none' for conversations outside any project."
    ),
)
@click.option("--dry-run", is_flag=True, help="Show what would be synced without sending anything to Joplin.")
@click.pass_context
def joplin(ctx: click.Context, provider: str, project_filter: str | None, dry_run: bool) -> None:
    """Sync exported conversations to Joplin as notes.

    Reads the local export cache and pushes exported Markdown files to Joplin
    via its local REST API. Requires Joplin desktop to be running with the
    Web Clipper service enabled.

    Notebooks are created automatically based on provider and project:
        exports/chatgpt/my-project/  →  "ChatGPT - My Project" notebook
        exports/claude/no-project/   →  "Claude - No Project" notebook

    Re-running is safe: notes are updated (not duplicated) on subsequent runs.

    Setup:
      1. Open Joplin desktop.
      2. Go to Tools → Options → Web Clipper.
      3. Enable the Web Clipper service.
      4. Copy the Authorization token.
      5. Set JOPLIN_API_TOKEN= in your .env file.
    """
    debug = ctx.obj.get("debug", False)
    manifest: Cache = ctx.obj["cache"]

    cfg = _load_config_or_exit(debug)

    # A missing token is a setup problem, not a runtime failure — show the fix.
    if not cfg.joplin_api_token:
        err_console.print(
            "[red]JOPLIN_API_TOKEN is not set.[/red]\n"
            "  1. Open Joplin → Tools → Options → Web Clipper.\n"
            "  2. Enable the Web Clipper service.\n"
            "  3. Copy the Authorization token.\n"
            "  4. Add [bold]JOPLIN_API_TOKEN=[/bold] to your .env file."
        )
        sys.exit(1)

    from src.joplin import JoplinClient, JoplinError, notebook_title

    client = JoplinClient(cfg.joplin_api_url, cfg.joplin_api_token)

    # Dry runs never touch the API, so skip connectivity checks entirely.
    if not dry_run:
        console.print(f"[dim]Connecting to Joplin at {cfg.joplin_api_url}…[/dim]")
        try:
            if not client.ping():
                err_console.print(
                    "[red]Joplin is not responding.[/red] "
                    "Make sure Joplin desktop is open and Web Clipper is enabled."
                )
                sys.exit(1)
            # Ping succeeded but doesn't validate the token — check auth separately
            client.validate_token()
        except JoplinError as e:
            err_console.print(f"[red]Joplin connection error:[/red] {e}")
            sys.exit(1)
        console.print("[green]Joplin connected and token validated.[/green]")

    # Providers to process, always in a fixed order.
    wanted = [p for p in ("chatgpt", "claude") if provider in (p, "all")]

    summary: dict[str, dict[str, int]] = {}

    for prov in wanted:
        summary[prov] = {"created": 0, "updated": 0, "skipped": 0, "failed": 0}

        queue = manifest.get_joplin_pending(prov)
        logger.debug("[joplin] %s: %d pending before filter", prov, len(queue))

        # Apply --project against the cached entry's project field.
        if project_filter is not None:
            needle = project_filter.lower()

            def _keep(meta: dict) -> bool:
                proj = meta.get("project") or None
                if needle == "none":
                    return proj is None or proj == "no-project"
                return bool(proj) and needle in proj.lower()

            kept = [(cid, meta) for cid, meta in queue if _keep(meta)]
            logger.debug(
                "[joplin] %s: --project %r filtered %d → %d",
                prov, project_filter, len(queue), len(kept),
            )
            queue = kept

        if not queue:
            console.print(f"\n[bold cyan][{prov.upper()}][/bold cyan] All up to date — nothing to sync.")
            continue

        console.print(
            f"\n[bold cyan][{prov.upper()}][/bold cyan] "
            f"{len(queue)} conversation(s) to sync to Joplin."
        )

        if dry_run:
            _print_joplin_dry_run_table(prov, queue)
            continue

        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task = progress.add_task(f"Syncing {prov}…", total=len(queue))

            for cid, meta in queue:
                md_path = meta.get("file_path", "")
                note_title = meta.get("title") or "Untitled"
                proj_name = meta.get("project") or None
                known_note_id = meta.get("joplin_note_id")
                action = "update" if known_note_id else "create"

                logger.debug(
                    "[joplin] %s %s/%s: %s (file=%s)",
                    action, prov, cid[:8], note_title[:60], md_path,
                )

                try:
                    # Read the exported Markdown file from disk.
                    body = Path(md_path).read_text(encoding="utf-8")
                    logger.debug("[joplin] Read %d chars from %s", len(body), md_path)

                    # Resolve (or create) the destination notebook.
                    notebook_id = client.get_or_create_notebook(notebook_title(prov, proj_name))

                    if known_note_id:
                        client.update_note(known_note_id, note_title, body)
                        manifest.mark_joplin_synced(prov, cid, known_note_id)
                        summary[prov]["updated"] += 1
                    else:
                        new_note_id = client.create_note(note_title, body, notebook_id)
                        manifest.mark_joplin_synced(prov, cid, new_note_id)
                        summary[prov]["created"] += 1

                except FileNotFoundError:
                    logger.warning(
                        "[joplin] Skipping %s/%s — exported file not found: %s",
                        prov, cid[:8], md_path,
                    )
                    summary[prov]["skipped"] += 1
                except JoplinError as e:
                    logger.error(
                        "[joplin] Failed to %s note for %s/%s: %s",
                        action, prov, cid[:8], e,
                    )
                    summary[prov]["failed"] += 1
                except OSError as e:
                    logger.error(
                        "[joplin] File read error for %s/%s (%s): %s",
                        prov, cid[:8], md_path, e,
                    )
                    summary[prov]["failed"] += 1
                finally:
                    progress.advance(task)

    if not dry_run:
        _print_joplin_summary(summary)


def _print_joplin_dry_run_table(prov_name: str, pending: list[tuple[str, dict]]) -> None:
    """Render what the joplin command would do, without touching the API."""
    from src.joplin import notebook_title

    table = Table(title=f"[DRY RUN] {prov_name.upper()} — Would sync {len(pending)} conversation(s)")
    for heading in ("Title", "Project", "Notebook", "Action"):
        table.add_column(heading)

    # Cap the table at 50 rows so very large backlogs stay readable.
    for _cid, meta in pending[:50]:
        table.add_row(
            (meta.get("title") or "Untitled")[:50],
            (meta.get("project") or "no-project")[:30],
            notebook_title(prov_name, meta.get("project")),
            "update" if meta.get("joplin_note_id") else "create",
        )

    if len(pending) > 50:
        table.add_row(f"… and {len(pending) - 50} more", "", "", "")

    console.print(table)


def _print_joplin_summary(summary: dict[str, dict[str, int]]) -> None:
    """Render the final per-provider created/updated/skipped/failed counts."""
    table = Table(title="Joplin Sync Summary")
    table.add_column("Provider", style="bold")
    for heading in ("Created", "Updated", "Skipped", "Failed"):
        table.add_column(heading, justify="right")

    for prov, counts in summary.items():
        failed = counts["failed"]
        table.add_row(
            prov.capitalize(),
            str(counts["created"]),
            str(counts["updated"]),
            str(counts["skipped"]),
            f"[red]{failed}[/red]" if failed else "0",
        )

    console.print(table)
Import them so _make_request +# can catch both when a curl_cffi session is in use. +try: + from curl_cffi.requests.exceptions import ( + HTTPError as _CurlHTTPError, + ConnectionError as _CurlConnectionError, + Timeout as _CurlTimeout, + ) +except ImportError: + # Fall back to requests types — catching them twice is harmless. + _CurlHTTPError = requests.HTTPError # type: ignore[misc,assignment] + _CurlConnectionError = requests.ConnectionError # type: ignore[misc,assignment] + _CurlTimeout = requests.Timeout # type: ignore[misc,assignment] + logger = logging.getLogger(__name__) # Request timeouts (connect, read) in seconds @@ -271,7 +286,7 @@ class BaseProvider(ABC): except ProviderError: raise - except (requests.ConnectionError, requests.Timeout) as e: + except (requests.ConnectionError, requests.Timeout, _CurlConnectionError, _CurlTimeout) as e: last_exc = e if attempt > MAX_RETRIES: raise ProviderError( @@ -293,7 +308,7 @@ class BaseProvider(ABC): ) time.sleep(wait) - except requests.HTTPError as e: + except (requests.HTTPError, _CurlHTTPError) as e: raise ProviderError( self.provider_name, f"{method} {url}", e ) from e diff --git a/src/providers/chatgpt.py b/src/providers/chatgpt.py index 90c7400..fd9f8c3 100644 --- a/src/providers/chatgpt.py +++ b/src/providers/chatgpt.py @@ -1,4 +1,23 @@ -"""ChatGPT provider — accesses chat.openai.com internal web API.""" +"""ChatGPT provider — accesses chat.openai.com internal web API. + +ChatGPT Projects discovery +-------------------------- +ChatGPT Projects are internally implemented as "snorlax"-type gizmos with IDs +starting with "g-p-". They are *not* returned by any gizmo listing endpoint +(/gizmos/mine, /gizmos/pinned, /gizmos/discovery, /gizmos/search). The +frontend appears to load project IDs from page-level state, not a dedicated +listing API. + +Therefore, project IDs must be supplied by the user via CHATGPT_PROJECT_IDS. 
+Each project gizmo ID looks like "g-p-68c2b2b3037c8191890036fb4ae3ed9f" and +can be read from the browser URL when viewing a project: + https://chatgpt.com/g/{project-gizmo-id}-{slug}/project + +Project conversations are fetched via cursor-based pagination at: + GET /backend-api/gizmos/{project_gizmo_id}/conversations?cursor=0 +Response: {"items": [...], "cursor": ""} +Pagination ends when cursor is null or an empty string. +""" import logging import os @@ -34,17 +53,22 @@ class ChatGPTProvider(BaseProvider): provider_name = "chatgpt" - def __init__(self, session_token: str | None = None) -> None: + def __init__( + self, + session_token: str | None = None, + project_ids: list[str] | None = None, + ) -> None: # Pass a curl_cffi session to the base class instead of a requests.Session. # curl_cffi.requests.Session is API-compatible with requests.Session. cf_session = curl_requests.Session(impersonate=IMPERSONATE) super().__init__(session=cf_session) # type: ignore[arg-type] - # Remove the User-Agent set by BaseProvider. curl_cffi sets a UA that is - # consistent with its TLS JA3 fingerprint for chrome120. If we leave a - # mismatched UA (e.g. Chrome/121 header with Chrome/120 TLS), Cloudflare's - # bot detection flags it. Removing it lets curl_cffi manage its own UA. + # Remove headers that curl_cffi manages as part of its Chrome fingerprint. + # Overriding User-Agent, Accept, or Accept-Language with non-Chrome values + # creates header/TLS inconsistencies that Cloudflare's bot detection flags. self._session.headers.pop("User-Agent", None) + self._session.headers.pop("Accept", None) + self._session.headers.pop("Accept-Language", None) token = session_token or os.getenv("CHATGPT_SESSION_TOKEN", "").strip() if not token: @@ -58,6 +82,17 @@ class ChatGPTProvider(BaseProvider): ) self._session_token = token + # Project gizmo IDs (g-p-xxx) whose conversations we'll fetch. 
+ # ChatGPT project conversations do not appear in the default + # /conversations listing — they require explicit project IDs. + self._project_ids: list[str] = project_ids or [] + + # Maps conv_id → project_name; populated by fetch_all_conversations() + self._project_map: dict[str, str] = {} + + # Cache of project_id → display name (avoids re-fetching gizmo details) + self._project_name_cache: dict[str, str] = {} + # Set the session cookie in the cookie jar self._session.cookies.set( "__Secure-next-auth.session-token", @@ -66,10 +101,13 @@ class ChatGPTProvider(BaseProvider): path="/", ) + # Set only Referer and sec-fetch-* headers for the auth exchange. + # Origin is intentionally omitted: Chrome does not send Origin on + # same-origin GET requests, and its presence alongside + # sec-fetch-site: same-origin contradicts the browser fingerprint. self._session.headers.update( { "Referer": "https://chatgpt.com/", - "Origin": "https://chatgpt.com", "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", @@ -78,8 +116,16 @@ class ChatGPTProvider(BaseProvider): # Exchange the session cookie for an access token self._access_token: str = self._fetch_access_token() + + # Now set backend-api headers (after auth, so they don't interfere with + # the auth exchange which expects a browser-style request). self._session.headers["Authorization"] = f"Bearer {self._access_token}" - logger.debug("[chatgpt] Session initialised with Chrome TLS impersonation (token: [REDACTED])") + self._session.headers["Accept"] = "application/json" + self._session.headers["Origin"] = "https://chatgpt.com" + logger.debug( + "[chatgpt] Session initialised (Chrome TLS impersonation, %d project ID(s) configured)", + len(self._project_ids), + ) def _fetch_access_token(self) -> str: """Exchange the session cookie for a Bearer access token. 
@@ -132,14 +178,22 @@ class ChatGPTProvider(BaseProvider): RuntimeError("401 Unauthorized — ChatGPT token expired"), ) + # ------------------------------------------------------------------ + # Default workspace conversations (offset-based pagination) + # ------------------------------------------------------------------ + def list_conversations(self, offset: int = 0, limit: int = 100) -> list[dict]: - """Fetch one page of conversations. + """Fetch one page of conversations from the default workspace. + + Note: Project conversations are NOT included here. They require + separate fetching via list_project_conversations(). Returns: List of conversation summary dicts. """ url = f"{BASE_URL}/conversations" params = {"offset": offset, "limit": limit, "order": "updated"} + logger.debug("[chatgpt] list_conversations: GET %s params=%s", url, params) try: data = self._make_request("GET", url, params=params) except ProviderError: @@ -149,18 +203,315 @@ class ChatGPTProvider(BaseProvider): if not isinstance(data, dict): self._warn_unexpected_schema("list_conversations", "root") + logger.debug("[chatgpt] list_conversations: unexpected root type %s", type(data)) return [] items = data.get("items") if items is None: self._warn_unexpected_schema("list_conversations", "items") + logger.debug("[chatgpt] list_conversations: response keys = %s", list(data.keys())) return [] + logger.debug("[chatgpt] list_conversations: got %d items (offset=%d)", len(items), offset) return items + # ------------------------------------------------------------------ + # Project conversations (cursor-based pagination) + # ------------------------------------------------------------------ + + def _fetch_project_name(self, project_id: str) -> str: + """Fetch the display name for a project gizmo. + + Calls GET /backend-api/gizmos/{project_id} and returns the display + name from gizmo.display.name. Falls back to the project_id itself + if the fetch fails or the name is missing. 
+ + Result is cached in self._project_name_cache. + """ + if project_id in self._project_name_cache: + return self._project_name_cache[project_id] + + url = f"{BASE_URL}/gizmos/{project_id}" + logger.debug("[chatgpt] _fetch_project_name: GET %s", url) + try: + data = self._make_request("GET", url) + gizmo = data.get("gizmo", {}) if isinstance(data, dict) else {} + name = (gizmo.get("display") or {}).get("name") or gizmo.get("name") or "" + name = name.strip() or project_id + gizmo_type = gizmo.get("gizmo_type", "?") + logger.debug( + "[chatgpt] _fetch_project_name[%s]: name=%r gizmo_type=%r", + project_id[:12], + name, + gizmo_type, + ) + except ProviderError as e: + logger.warning( + "[chatgpt] Could not fetch project name for %s: %s — using ID as name", + project_id, + e, + ) + name = project_id + + self._project_name_cache[project_id] = name + return name + + def list_project_conversations( + self, project_id: str, cursor: str = "0" + ) -> tuple[list[dict], str | None]: + """Fetch one page of conversations for a project gizmo. + + Uses cursor-based pagination (not offset). The initial cursor is "0". + Subsequent cursors come from the response's "cursor" field. + + Endpoint: GET /backend-api/gizmos/{project_id}/conversations?cursor= + + Returns: + (items, next_cursor) — next_cursor is None or "" when exhausted. 
+ """ + url = f"{BASE_URL}/gizmos/{project_id}/conversations" + params = {"cursor": cursor} + logger.debug( + "[chatgpt] list_project_conversations[%s]: GET %s cursor=%r", + project_id[:12], + url, + cursor, + ) + + try: + data = self._make_request("GET", url, params=params) + except ProviderError: + raise + except Exception as e: + raise ProviderError(self.provider_name, "list_project_conversations", e) from e + + logger.debug( + "[chatgpt] list_project_conversations[%s]: response type=%s", + project_id[:12], + type(data).__name__, + ) + + if isinstance(data, list): + # Bare list — no next cursor available + logger.debug( + "[chatgpt] list_project_conversations[%s]: bare list with %d items", + project_id[:12], + len(data), + ) + return data, None + + if not isinstance(data, dict): + self._warn_unexpected_schema("list_project_conversations", "root") + logger.debug( + "[chatgpt] list_project_conversations[%s]: unexpected type %s value=%r", + project_id[:12], + type(data), + data, + ) + return [], None + + logger.debug( + "[chatgpt] list_project_conversations[%s]: response keys=%s", + project_id[:12], + list(data.keys()), + ) + + items = data.get("items") or data.get("conversations") or [] + next_cursor = data.get("cursor") or None # empty string → treat as None + + if not items and data: + logger.debug( + "[chatgpt] list_project_conversations[%s]: no items found; full response=%r", + project_id[:12], + data, + ) + + logger.debug( + "[chatgpt] list_project_conversations[%s]: %d items, next_cursor=%r", + project_id[:12], + len(items), + next_cursor[:20] + "…" if next_cursor and len(next_cursor) > 20 else next_cursor, + ) + return items, next_cursor + + # ------------------------------------------------------------------ + # Combined fetch (default workspace + all configured projects) + # ------------------------------------------------------------------ + + def fetch_all_conversations(self, since=None) -> list[dict]: + """Fetch all conversations: default workspace + 
every configured project. + + ChatGPT project conversations are not included in the default + /conversations listing. They must be fetched separately via the + gizmos conversations endpoint using project IDs from CHATGPT_PROJECT_IDS. + + Builds self._project_map (conv_id → project_name) as a side effect so + that normalize_conversation() can attach the project name without an + additional API call. + + Args: + since: Optional datetime — only return conversations updated at or + after this time (client-side filter, same as base class). + + Returns: + Combined list of raw conversation summary dicts. + """ + # Reset maps so a fresh fetch always rebuilds them cleanly + self._project_map = {} + + # --- Default workspace (base class handles offset-based pagination) --- + logger.info("[chatgpt] Fetching default workspace conversations…") + default_convs = super().fetch_all_conversations(since=None) + logger.info("[chatgpt] Default workspace: %d conversations", len(default_convs)) + + # --- Project conversations --- + if not self._project_ids: + logger.info( + "[chatgpt] No project IDs configured — skipping project conversations. " + "To include projects, set CHATGPT_PROJECT_IDS in .env " + "(see 'python -m src.main auth' for instructions)." 
+ ) + return self._apply_since_filter(default_convs, since) + + logger.info( + "[chatgpt] Fetching conversations for %d project(s): %s", + len(self._project_ids), + self._project_ids, + ) + + project_convs: list[dict] = [] + for project_id in self._project_ids: + project_name = self._fetch_project_name(project_id) + logger.info( + "[chatgpt] Project '%s' (%s): fetching conversations…", + project_name, + project_id, + ) + + cursor: str = "0" + page = 0 + project_total = 0 + + while True: + page += 1 + logger.debug( + "[chatgpt] Project '%s': page %d cursor=%r", + project_name, + page, + cursor[:20] + "…" if len(cursor) > 20 else cursor, + ) + + try: + batch, next_cursor = self.list_project_conversations( + project_id, cursor=cursor + ) + except ProviderError as e: + logger.warning( + "[chatgpt] Project '%s': failed to fetch page %d: %s — stopping pagination", + project_name, + page, + e, + ) + break + + if not batch: + logger.debug( + "[chatgpt] Project '%s': empty batch on page %d — done", + project_name, + page, + ) + break + + for conv in batch: + conv_id = conv.get("id") + if conv_id: + self._project_map[conv_id] = project_name + else: + logger.debug( + "[chatgpt] Project '%s': conversation with no id: %r", + project_name, + conv, + ) + # Annotate so callers can filter by project without the map + conv["_project_name"] = project_name + + project_convs.extend(batch) + project_total += len(batch) + + logger.debug( + "[chatgpt] Project '%s': page %d → %d items (project total: %d)", + project_name, + page, + len(batch), + project_total, + ) + + if not next_cursor: + logger.debug( + "[chatgpt] Project '%s': no next cursor — pagination complete", + project_name, + ) + break + + cursor = next_cursor + + logger.info( + "[chatgpt] Project '%s': %d conversations fetched", + project_name, + project_total, + ) + + all_convs = default_convs + project_convs + logger.info( + "[chatgpt] Total: %d conversations (%d default + %d from %d project(s))", + len(all_convs), + 
len(default_convs), + len(project_convs), + len(self._project_ids), + ) + logger.debug( + "[chatgpt] _project_map: %d entries → %s", + len(self._project_map), + {k[:8]: v for k, v in self._project_map.items()}, + ) + + return self._apply_since_filter(all_convs, since) + + def _apply_since_filter(self, convs: list[dict], since) -> list[dict]: + """Filter conversations to those updated at or after `since`.""" + if since is None: + return convs + + since_naive = since.replace(tzinfo=None) + filtered = [] + for c in convs: + raw_ts = c.get("updated_at") or c.get("update_time") or "" + if raw_ts: + try: + from src.utils import _parse_dt + updated = _parse_dt(str(raw_ts)).replace(tzinfo=None) + if updated >= since_naive: + filtered.append(c) + except Exception: + filtered.append(c) # include if date unparseable + else: + filtered.append(c) + + logger.info( + "[chatgpt] After --since filter: %d/%d conversations", + len(filtered), + len(convs), + ) + return filtered + + # ------------------------------------------------------------------ + # Single conversation detail + # ------------------------------------------------------------------ + def get_conversation(self, conv_id: str) -> dict: """Fetch full conversation detail for a single ID.""" url = f"{BASE_URL}/conversation/{conv_id}" + logger.debug("[chatgpt] get_conversation: GET %s", url) try: data = self._make_request("GET", url) except ProviderError: @@ -172,25 +523,41 @@ class ChatGPTProvider(BaseProvider): self._warn_unexpected_schema("get_conversation", "root") return {} + logger.debug( + "[chatgpt] get_conversation[%s]: keys=%s mapping_size=%d", + conv_id[:8], + list(data.keys()), + len(data.get("mapping", {})), + ) return data + # ------------------------------------------------------------------ + # Normalization + # ------------------------------------------------------------------ + def normalize_conversation(self, raw: dict) -> dict: """Transform ChatGPT raw schema to the common normalized schema. 
ChatGPT stores messages in a nested ``mapping`` dict where each node has an ``id``, ``message``, and ``children`` list. We walk the tree from the root node to build a flat ordered message list. + + Project name is looked up from self._project_map (populated by + fetch_all_conversations). The conversation detail endpoint does not + include project information. """ conv_id = raw.get("id", "") title = raw.get("title") or "Untitled" created_at = _ts_to_iso(raw.get("create_time")) updated_at = _ts_to_iso(raw.get("update_time")) - # Project info — ChatGPT calls it "gizmo_id" or stores project info differently. - # As of 2024, personal projects appear as a separate projects API; conversations - # linked to a project have a non-null `workspace_id` or similar field. - # We use `project_title` if present, else None. - project: str | None = raw.get("project_title") or raw.get("workspace_title") or None + # Look up project name from the map built during fetch_all_conversations. + project = self._project_map.get(conv_id) if conv_id else None + logger.debug( + "[chatgpt] normalize_conversation[%s]: project_map lookup → %r", + conv_id[:8] if conv_id else "?", + project, + ) mapping: dict = raw.get("mapping", {}) messages = _extract_messages(mapping, raw, conv_id) diff --git a/tests/test_joplin.py b/tests/test_joplin.py new file mode 100644 index 0000000..c0284c4 --- /dev/null +++ b/tests/test_joplin.py @@ -0,0 +1,341 @@ +"""Unit tests for src/joplin.py (JoplinClient).""" + +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from src.joplin import JoplinClient, JoplinError, _http_error_message, _timeout_message, notebook_title + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_client() -> JoplinClient: + return JoplinClient(base_url="http://localhost:41184", token="test-token") + + +def _mock_response(json_data=None, 
text="", status_code=200): + resp = MagicMock() + resp.status_code = status_code + resp.text = text + resp.json.return_value = json_data or {} + resp.raise_for_status = MagicMock() + if status_code >= 400: + resp.raise_for_status.side_effect = requests.exceptions.HTTPError( + response=resp + ) + return resp + + +# --------------------------------------------------------------------------- +# notebook_title helper +# --------------------------------------------------------------------------- + + +class TestNotebookTitle: + def test_no_project(self): + assert notebook_title("chatgpt", None) == "ChatGPT - No Project" + + def test_no_project_string(self): + assert notebook_title("chatgpt", "no-project") == "ChatGPT - No Project" + + def test_project_with_hyphens(self): + assert notebook_title("chatgpt", "my-project") == "ChatGPT - My Project" + + def test_claude_provider(self): + assert notebook_title("claude", "budget-tracker") == "Claude - Budget Tracker" + + def test_multi_word_project(self): + assert notebook_title("claude", "ai-research-notes") == "Claude - Ai Research Notes" + + +# --------------------------------------------------------------------------- +# ping +# --------------------------------------------------------------------------- + + +class TestPing: + def test_ping_success(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.return_value = _mock_response(text="JoplinClipperServer") + assert client.ping() is True + + def test_ping_not_joplin(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.return_value = _mock_response(text="SomeOtherServer") + assert client.ping() is False + + def test_ping_connection_refused(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.ConnectionError() + assert client.ping() is False + + def test_ping_timeout_returns_false(self): + """Ping timeout is not an error — Joplin just isn't 
responding.""" + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.Timeout() + assert client.ping() is False + + def test_ping_invalid_url_raises_joplin_error(self): + """Non-connection, non-timeout errors (e.g. invalid URL) surface as JoplinError.""" + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.InvalidURL("bad url") + with pytest.raises(JoplinError): + client.ping() + + +class TestValidateToken: + def test_validate_token_success(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.return_value = _mock_response(json_data={"items": [], "has_more": False}) + client.validate_token() # should not raise + + def test_validate_token_401_raises_joplin_error(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.return_value = _mock_response(status_code=401) + with pytest.raises(JoplinError, match="401"): + client.validate_token() + + +class TestTimeoutMessage: + def test_includes_timeout_duration(self): + import src.joplin as joplin_module + msg = _timeout_message("POST", "/notes") + assert "POST" in msg + assert "/notes" in msg + assert str(joplin_module._REQUEST_TIMEOUT) in msg + + def test_includes_actionable_hints(self): + msg = _timeout_message("PUT", "/notes/abc") + assert "JOPLIN_REQUEST_TIMEOUT" in msg + # Should mention at least one cause + assert "large" in msg.lower() or "busy" in msg.lower() or "frozen" in msg.lower() + + +class TestTimeoutHandling: + def test_get_timeout_raises_joplin_error_with_clear_message(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.Timeout() + with pytest.raises(JoplinError) as exc_info: + client._get("/folders") + assert "timed out" in str(exc_info.value).lower() + assert "JOPLIN_REQUEST_TIMEOUT" in str(exc_info.value) + + def 
test_post_timeout_raises_joplin_error_with_clear_message(self): + client = _make_client() + with patch("requests.post") as mock_post: + mock_post.side_effect = requests.exceptions.Timeout() + with pytest.raises(JoplinError) as exc_info: + client._post("/notes", {"title": "Test"}) + assert "timed out" in str(exc_info.value).lower() + + def test_put_timeout_raises_joplin_error_with_clear_message(self): + client = _make_client() + with patch("requests.put") as mock_put: + mock_put.side_effect = requests.exceptions.Timeout() + with pytest.raises(JoplinError) as exc_info: + client._put("/notes/abc", {"title": "Test"}) + assert "timed out" in str(exc_info.value).lower() + + def test_create_note_timeout_propagates(self): + """Timeout on create_note surfaces as JoplinError, not raw requests exception.""" + client = _make_client() + with patch("requests.post") as mock_post: + mock_post.side_effect = requests.exceptions.Timeout() + with pytest.raises(JoplinError, match="timed out"): + client.create_note("Big Note", "x" * 100_000, "nb-123") + + def test_update_note_timeout_propagates(self): + client = _make_client() + with patch("requests.put") as mock_put: + mock_put.side_effect = requests.exceptions.Timeout() + with pytest.raises(JoplinError, match="timed out"): + client.update_note("note-id", "Big Note", "x" * 100_000) + + +class TestHttpErrorMessage: + def test_401_gives_token_hint(self): + resp = MagicMock() + resp.status_code = 401 + resp.text = "Unauthorized" + e = requests.exceptions.HTTPError(response=resp) + msg = _http_error_message("GET", "/folders", e) + assert "401" in msg + assert "token" in msg.lower() + + def test_404_gives_deleted_note_hint(self): + resp = MagicMock() + resp.status_code = 404 + resp.text = "Not Found" + e = requests.exceptions.HTTPError(response=resp) + msg = _http_error_message("PUT", "/notes/abc", e) + assert "404" in msg + assert "deleted" in msg.lower() + + def test_other_error_includes_status_and_body(self): + resp = MagicMock() + 
resp.status_code = 500 + resp.text = "Internal Server Error" + e = requests.exceptions.HTTPError(response=resp) + msg = _http_error_message("POST", "/notes", e) + assert "500" in msg + + +# --------------------------------------------------------------------------- +# list_notebooks +# --------------------------------------------------------------------------- + + +class TestListNotebooks: + def test_list_notebooks_single_page(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.return_value = _mock_response( + json_data={"items": [{"id": "nb1", "title": "ChatGPT - No Project"}], "has_more": False} + ) + result = client.list_notebooks() + assert len(result) == 1 + assert result[0]["id"] == "nb1" + + def test_list_notebooks_paginated(self): + client = _make_client() + page1 = _mock_response( + json_data={"items": [{"id": "nb1", "title": "A"}], "has_more": True} + ) + page2 = _mock_response( + json_data={"items": [{"id": "nb2", "title": "B"}], "has_more": False} + ) + with patch("requests.get") as mock_get: + mock_get.side_effect = [page1, page2] + result = client.list_notebooks() + assert len(result) == 2 + assert {nb["id"] for nb in result} == {"nb1", "nb2"} + + def test_list_notebooks_connection_error(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.ConnectionError() + with pytest.raises(JoplinError, match="Joplin"): + client.list_notebooks() + + +# --------------------------------------------------------------------------- +# get_or_create_notebook +# --------------------------------------------------------------------------- + + +class TestGetOrCreateNotebook: + def test_returns_existing_notebook_id(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.return_value = _mock_response( + json_data={ + "items": [{"id": "nb-existing", "title": "ChatGPT - No Project"}], + "has_more": False, + } + ) + nb_id = 
client.get_or_create_notebook("ChatGPT - No Project") + assert nb_id == "nb-existing" + + def test_creates_new_notebook_when_not_found(self): + client = _make_client() + with patch("requests.get") as mock_get, patch("requests.post") as mock_post: + mock_get.return_value = _mock_response( + json_data={"items": [], "has_more": False} + ) + mock_post.return_value = _mock_response( + json_data={"id": "nb-new", "title": "ChatGPT - New Project"} + ) + nb_id = client.get_or_create_notebook("ChatGPT - New Project") + assert nb_id == "nb-new" + mock_post.assert_called_once() + + def test_caches_notebook_after_first_load(self): + client = _make_client() + with patch("requests.get") as mock_get: + mock_get.return_value = _mock_response( + json_data={ + "items": [{"id": "nb1", "title": "Claude - No Project"}], + "has_more": False, + } + ) + # Call twice — GET /folders should only happen once + client.get_or_create_notebook("Claude - No Project") + client.get_or_create_notebook("Claude - No Project") + assert mock_get.call_count == 1 + + +# --------------------------------------------------------------------------- +# create_note +# --------------------------------------------------------------------------- + + +class TestCreateNote: + def test_create_note_returns_id(self): + client = _make_client() + with patch("requests.post") as mock_post: + mock_post.return_value = _mock_response( + json_data={"id": "note-123", "title": "My Note"} + ) + note_id = client.create_note("My Note", "Note body", "nb-456") + assert note_id == "note-123" + _, kwargs = mock_post.call_args + assert kwargs["json"]["title"] == "My Note" + assert kwargs["json"]["body"] == "Note body" + assert kwargs["json"]["parent_id"] == "nb-456" + + def test_create_note_connection_error(self): + client = _make_client() + with patch("requests.post") as mock_post: + mock_post.side_effect = requests.exceptions.ConnectionError() + with pytest.raises(JoplinError, match="Joplin"): + client.create_note("Title", "Body", 
"nb-id") + + def test_create_note_http_error(self): + client = _make_client() + with patch("requests.post") as mock_post: + mock_post.return_value = _mock_response(status_code=401) + with pytest.raises(JoplinError): + client.create_note("Title", "Body", "nb-id") + + +# --------------------------------------------------------------------------- +# update_note +# --------------------------------------------------------------------------- + + +class TestUpdateNote: + def test_update_note_calls_put(self): + client = _make_client() + with patch("requests.put") as mock_put: + mock_put.return_value = _mock_response(json_data={"id": "note-123"}) + client.update_note("note-123", "New Title", "New Body") + mock_put.assert_called_once() + _, kwargs = mock_put.call_args + assert kwargs["json"]["title"] == "New Title" + assert kwargs["json"]["body"] == "New Body" + + def test_update_note_connection_error(self): + client = _make_client() + with patch("requests.put") as mock_put: + mock_put.side_effect = requests.exceptions.ConnectionError() + with pytest.raises(JoplinError, match="Joplin"): + client.update_note("note-id", "Title", "Body") + + def test_update_note_http_error(self): + client = _make_client() + with patch("requests.put") as mock_put: + mock_put.return_value = _mock_response(status_code=404) + with pytest.raises(JoplinError): + client.update_note("note-id", "Title", "Body") diff --git a/tests/test_providers.py b/tests/test_providers.py index c2b3c8e..efbcc21 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -13,15 +13,17 @@ class TestChatGPTNormalization: def _get_provider(self): from src.providers.chatgpt import ChatGPTProvider - import unittest.mock as mock # Bypass __init__ token check p = ChatGPTProvider.__new__(ChatGPTProvider) import requests p._session = requests.Session() p._org_id = None + p._project_ids = [] + p._project_map = {} + p._project_name_cache = {} return p - def test_normalizes_with_project(self): + def 
test_normalizes_conversation(self): raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) p = self._get_provider() result = p.normalize_conversation(raw) @@ -29,7 +31,8 @@ class TestChatGPTNormalization: assert result["id"] == "chatgpt-conv-001" assert result["title"] == "Python Async Tutorial" assert result["provider"] == "chatgpt" - assert result["project"] == "Learning Python" + # No entry in _project_map → project is None + assert result["project"] is None assert result["created_at"] != "" assert result["updated_at"] != "" assert isinstance(result["messages"], list) @@ -42,6 +45,15 @@ class TestChatGPTNormalization: assert result["project"] is None assert result["id"] == "chatgpt-conv-002" + def test_normalizes_with_project_from_map(self): + """Project name from _project_map (populated by fetch_all_conversations) flows through.""" + raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) + p = self._get_provider() + p._project_map["chatgpt-conv-001"] = "My Research Project" + result = p.normalize_conversation(raw) + + assert result["project"] == "My Research Project" + def test_extracts_text_messages(self): raw = json.loads((FIXTURES / "chatgpt_conversation.json").read_text()) p = self._get_provider()