Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 1 addition & 195 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,195 +1 @@
# Database files
*.db

# Python specific
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
env/
venv/
.venv/
.ENV
.python-version
pip-log.txt
pip-delete-this-directory.txt
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.log
*.pot
*.po
*~
.pytest_cache/
.mypy_cache/
.hypothesis/
.eggs/
*.egg-info/
.installed.cfg
.shared_clones/

# IDE and editor files
.vscode/
.idea/
*.swp
*.swo
*.tmp

# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
desktop.ini

# Build and distribution artifacts
dist/
build/
*.egg
*.pyc
*.pyo
*.pyd
*.so
*.dylib
*.dll
*.exe
*.out
*.o
*.obj
target/
.gradle/
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Environment variables
.env
.env.local
*.env
.env.*
!.env.example

# Testing
.coverage
htmlcov/
.tox/
.nox/
coverage/
*.coverage
.coverage.*

# Security
secret.key
secrets.json
config/secrets.yml

# Sync state and temporary data
data/db/sync_state.db
*.tmp
*.temp
*.bak
*.backup
*.swp
*.swo
*~
.DS_Store
Thumbs.db
*.lock
*.log
*.out
*.pid
*.seed
*.id
*.idx
*.dat
*.bin
*.cache
*.cached
*.session
*.sqlite
*.db
*.db-shm
*.db-wal
*.fdb
*.fdb-shm
*.fdb-wal
*.mdb
*.ndb
*.sdb
*.sdb-shm
*.sdb-wal
*.ldb
*.idb
*.pdb
*.gdb
*.gdb-index
*.core
*.stackdump
*.dmp
*.crash
*.crashpad
*.minidump
*.dSYM/
*.sym
*.map
*.lst
*.asm
*.o
*.obj
*.lib
*.a
*.so
*.dylib
*.dll
*.exe
*.out
*.jar
*.war
*.ear
*.zip
*.tar
*.gz
*.tgz
*.bz2
*.xz
*.7z
*.rar
*.zst
*.lz4
*.lzh
*.cab
*.arj
*.rpm
*.deb
*.Z
*.lz
*.lzo
*.tar.gz
*.tar.bz2
*.tar.xz
*.tar.zst
*.tmp
*.temp
*.tmp.*
*.temp.*
*.tmp-*
*.temp-*
*.tmp_*
*.temp_*
*.tmp/*
*.temp/*
*.tmp.*/**
*.temp.*/**
*.tmp-*/**
*.temp-*/**
*.tmp_*/**
*.temp_*/**
(src/fetchers/who_client.py)
Binary file added src/core/__pycache__/db.cpython-312.pyc
Binary file not shown.
Binary file added src/fetchers/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file not shown.
23 changes: 19 additions & 4 deletions src/fetchers/who_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ async def process_batch_async(
# Create semaphore to limit concurrent requests
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

# Track progress for checkpoint logging
successful_fetches = 0
failed_fetches = 0

# Create aiohttp session
async with aiohttp.ClientSession() as session:
# Create tasks for all nodes in batch
Expand All @@ -281,16 +285,26 @@ async def process_batch_async(
if isinstance(result, Exception):
# Task failed
failed_count += 1
failed_fetches += 1
console.print(f"[yellow]Failed to process {uri}: {result}[/yellow]")
continue

# Success - result is (uri, node_data)
_, node_data = result

if node_data is None:
failed_count += 1
failed_fetches += 1
continue

# Check if we got actual data (not empty response)
if not node_data or (isinstance(node_data, dict) and len(node_data) == 0):
console.print(f"[yellow]Empty response for {uri}, leaving as PENDING for retry[/yellow]")
failed_count += 1
continue

successful_fetches += 1

try:
# Extract node details
title_raw = node_data.get("title", "")
Expand Down Expand Up @@ -325,6 +339,10 @@ async def process_batch_async(
description = note.get("value", "")
break

# Tree Traversal: Collect child URIs for bulk insert BEFORE updating status
child_uris = extract_child_uris(node_data)
all_child_uris.extend(child_uris)

# Update current node with fetched data and mark as BASE_DONE
update_node_data(
conn,
Expand All @@ -336,10 +354,6 @@ async def process_batch_async(
status="BASE_DONE",
)

# Tree Traversal: Collect child URIs for bulk insert
child_uris = extract_child_uris(node_data)
all_child_uris.extend(child_uris)

processed_count += 1

except Exception as e:
Expand All @@ -357,6 +371,7 @@ async def process_batch_async(
conn.commit()

console.print(f"[green]Batch complete: {processed_count} processed, {failed_count} failed[/green]")
console.print(f"[dim]API fetch stats: {successful_fetches} successful, {failed_fetches} failed[/dim]")
console.print(f"[green]Remaining PENDING: {count_nodes_by_status(conn, 'PENDING')}[/green]")
console.print(f"[green]Total BASE_DONE: {count_nodes_by_status(conn, 'BASE_DONE')}[/green]")

Expand Down
Loading