-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
168 lines (145 loc) · 4.4 KB
/
Makefile
File metadata and controls
168 lines (145 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Every target in this file names a command rather than a file it produces, so
# each one is declared phony. Without this, a file or directory that happens to
# be called e.g. `coverage`, `stats` or `lint` would make the matching target
# look "up to date" and silently skip its recipe.
.PHONY: help install dev \
        process chunk chunk-all chunk-basic chunk-semantic index pipeline \
        test test-unit test-integration test-all coverage \
        query stats lint format \
        clean clean-chunks clean-cache
# Colors
# Terminal colour sequences used in status messages. The values hold the
# literal four characters `\033`, not an ESC byte; whatever command prints them
# is responsible for turning that into an escape character.
ansi_esc := \033
GREEN    := $(ansi_esc)[0;32m
YELLOW   := $(ansi_esc)[1;33m
RED      := $(ansi_esc)[0;31m
# NC ("no colour") resets the attributes set by the variables above.
NC       := $(ansi_esc)[0m
# Directories
# All data directories live under one root.
dataset_root := dataset
# Named in the help text as the input of `make process`; no recipe in this
# file reads the variable directly.
SRC_DIR      := $(dataset_root)/src
# *.json files here are the input of the chunk targets; emptied by `clean`.
RES_DIR      := $(dataset_root)/res
# Created by `chunk`; emptied by `clean` and `clean-chunks`.
CHUNKS_DIR   := $(dataset_root)/chunks
# Default target
# `help` is the first ordinary rule, so a bare `make` prints this usage
# summary. Keep it in sync with the targets defined below: every user-facing
# target should have a line here.
help:
	@echo "Data Ingest Pipeline"
	@echo ""
	@echo "Setup:"
	@echo " make install Install dependencies"
	@echo " make dev Install with dev dependencies"
	@echo ""
	@echo "Pipeline:"
	@echo " make process Step 1: Extract documents from dataset/src/"
	@echo " make chunk Step 2: Generate contextual chunks (all files)"
	@echo " make chunk FILE=x Step 2: Generate chunks for single file"
	@echo " make chunk-basic Step 2 (alt): Chunk with --chunking basic (FILE=x optional)"
	@echo " make chunk-semantic Step 2 (alt): Chunk with --chunking semantic (FILE=x optional)"
	@echo " make index Step 3: Index chunks to Pinecone"
	@echo " make pipeline Run full pipeline (process → chunk → index)"
	@echo ""
	@echo "Testing:"
	@echo " make test Run unit tests"
	@echo " make test-integration Run integration tests (real APIs)"
	@echo " make test-all Run unit and integration tests together"
	@echo " make coverage Run tests with coverage report"
	@echo ""
	@echo "Utilities:"
	@echo " make query Run retrieval examples"
	@echo " make stats Show Pinecone index statistics"
	@echo " make lint Run ruff check on src/ and tests/"
	@echo " make format Run ruff format on src/ and tests/"
	@echo " make clean Remove all generated files"
	@echo " make clean-chunks Remove generated chunks only"
	@echo " make clean-cache Remove Python, pytest and coverage caches"
# Setup

# Synchronise the environment with the project's declared dependencies.
install:
	uv sync

# Same as `install`, additionally requesting the `dev` extra.
dev:
	uv sync --extra dev
# Pipeline steps

# Step 1 (per the help text): runs the src.process_documents module.
process:
	uv run python -m src.process_documents
# Step 2: run src.contextual_chunking on the extracted documents.
#
#   make chunk             one invocation per $(RES_DIR)/*.json file
#   make chunk FILE=path   a single invocation for the given file
#
# Batch mode keeps going when a file fails: failures are counted, a summary is
# printed, and the target exits non-zero at the end if any file failed.
# Coloured lines use printf rather than echo because echo's handling of the
# `\033` in the colour variables differs between shells.
chunk:
ifdef FILE
	@echo "Processing single file: $(FILE)"
	uv run python -m src.contextual_chunking --single "$(FILE)"
else
	@echo "=========================================="
	@echo "Chunking All Documents"
	@echo "=========================================="
	@mkdir -p "$(CHUNKS_DIR)"
	@total=0; success=0; failed=0; \
	for file in "$(RES_DIR)"/*.json; do \
		if [ -f "$$file" ]; then \
			total=$$((total + 1)); \
			filename=$$(basename "$$file"); \
			echo ""; \
			printf '$(YELLOW)[%s] Processing: %s$(NC)\n' "$$total" "$$filename"; \
			if uv run python -m src.contextual_chunking --single "$$file" 2>&1; then \
				success=$$((success + 1)); \
				printf '$(GREEN)✓ Success: %s$(NC)\n' "$$filename"; \
			else \
				failed=$$((failed + 1)); \
				printf '$(RED)✗ Failed: %s$(NC)\n' "$$filename"; \
			fi; \
		fi; \
	done; \
	echo ""; \
	echo "=========================================="; \
	echo "Complete: $$success/$$total succeeded"; \
	if [ "$$total" -eq 0 ]; then \
		printf '$(YELLOW)No .json files found in %s$(NC)\n' "$(RES_DIR)"; \
	fi; \
	if [ "$$failed" -gt 0 ]; then \
		printf '$(RED)Failed: %s$(NC)\n' "$$failed"; \
		exit 1; \
	fi
endif
# Variant of `chunk` that passes `--chunking basic` to src.contextual_chunking.
#
#   make chunk-basic             every $(RES_DIR)/*.json file
#   make chunk-basic FILE=path   only the given file
#
# In batch mode every file is attempted; the number of failed invocations is
# tracked so the target exits non-zero if any of them failed, not just the
# last one.
chunk-basic:
ifdef FILE
	uv run python -m src.contextual_chunking --single "$(FILE)" --chunking basic
else
	@echo "Processing all files with basic chunking..."
	@failed=0; \
	for file in "$(RES_DIR)"/*.json; do \
		if [ -f "$$file" ]; then \
			echo "Processing: $$file"; \
			uv run python -m src.contextual_chunking --single "$$file" --chunking basic \
				|| failed=$$((failed + 1)); \
		fi; \
	done; \
	if [ "$$failed" -gt 0 ]; then \
		echo "Failed: $$failed file(s)"; \
		exit 1; \
	fi
endif
# Variant of `chunk` that passes `--chunking semantic` to
# src.contextual_chunking.
#
#   make chunk-semantic             every $(RES_DIR)/*.json file
#   make chunk-semantic FILE=path   only the given file
#
# In batch mode every file is attempted; the number of failed invocations is
# tracked so the target exits non-zero if any of them failed, not just the
# last one.
chunk-semantic:
ifdef FILE
	uv run python -m src.contextual_chunking --single "$(FILE)" --chunking semantic
else
	@echo "Processing all files with semantic chunking..."
	@failed=0; \
	for file in "$(RES_DIR)"/*.json; do \
		if [ -f "$$file" ]; then \
			echo "Processing: $$file"; \
			uv run python -m src.contextual_chunking --single "$$file" --chunking semantic \
				|| failed=$$((failed + 1)); \
		fi; \
	done; \
	if [ "$$failed" -gt 0 ]; then \
		echo "Failed: $$failed file(s)"; \
		exit 1; \
	fi
endif
# Step 3 (per the help text): runs the src.indexing module.
index:
	uv run python -m src.indexing
# Full pipeline: process, then chunk, then index.
#
# The steps are run as sequential sub-makes instead of being listed as
# prerequisites: prerequisites of one target have no guaranteed order relative
# to each other under `make -j`, and each step here consumes the previous
# step's output. Command-line variables such as FILE=... are passed on to the
# sub-makes automatically. A failing step stops the pipeline.
pipeline:
	@$(MAKE) --no-print-directory process
	@$(MAKE) --no-print-directory chunk
	@$(MAKE) --no-print-directory index
	@echo ""
	@printf '$(GREEN)Pipeline complete!$(NC)\n'
# Testing

# `test` has no recipe of its own; it is an alias for the unit suite.
test: test-unit

# Everything under tests/ except tests/test_integration.py, verbose output.
test-unit:
	uv run pytest tests/ -v --ignore=tests/test_integration.py

# Only tests/test_integration.py, with --run-integration.
# NOTE(review): --run-integration is presumably an option registered by the
# project's pytest configuration; confirm.
test-integration:
	uv run pytest tests/test_integration.py --run-integration -v

# Whole tests/ tree with --run-integration, i.e. unit and integration together.
test-all:
	uv run pytest tests/ --run-integration -v
# Run the tests/ tree measuring coverage of src, with two reports requested:
# term-missing on the terminal and html, whose location is echoed afterwards.
# --run-integration is not passed here.
coverage:
	uv run pytest tests/ --cov=src --cov-report=term-missing --cov-report=html
	@echo ""
	@echo "HTML report: htmlcov/index.html"
# Utilities

# Runs the src.retrieve module ("retrieval examples" per the help text).
query:
	uv run python -m src.retrieve
# Print index statistics using an inline Python program.
#
# The program imports PineconeIndexer from the src package, constructs it with
# index_name taken from the PINECONE_INDEX_NAME environment variable, calls
# get_index_stats(), and prints the total_vectors, dimensions and
# index_fullness entries of the result.
#
# Quoting notes:
#   - The whole program is one shell double-quoted string; the trailing
#     backslashes join the lines, so Python receives a single line of
#     `;`-separated statements.
#   - `\"` is a shell escape that yields the double quotes around the dict keys.
#   - Keep the continuation lines free of extra indentation: anything after the
#     recipe tab stays in the string, and leading whitespace before the first
#     statement would reach Python as part of the source.
#
# NOTE(review): os.getenv returns None when PINECONE_INDEX_NAME is unset; how
# PineconeIndexer treats index_name=None is not visible here - confirm.
stats:
	@uv run python -c "\
	from src import PineconeIndexer; \
	import os; \
	i = PineconeIndexer(index_name=os.getenv('PINECONE_INDEX_NAME')); \
	s = i.get_index_stats(); \
	print(f'Vectors: {s[\"total_vectors\"]:,}'); \
	print(f'Dimensions: {s[\"dimensions\"]}'); \
	print(f'Fullness: {s[\"index_fullness\"]:.2%}')"
# Static checks: `ruff check` over the source and test trees.
lint:
	uv run ruff check src/ tests/

# Formatting: `ruff format` over the same two trees.
format:
	uv run ruff format src/ tests/
# Cleaning

# Remove everything generated by the pipeline and the test runs: the contents
# of $(CHUNKS_DIR) and $(RES_DIR), the listed __pycache__ directories, and the
# pytest/coverage artefacts.
#
# The two guard lines abort before anything is deleted if a directory variable
# is empty (e.g. `make clean RES_DIR=`); otherwise the glob below would expand
# to `/*`. When the variable is set, each guard expands to an empty line.
clean:
	$(if $(strip $(CHUNKS_DIR)),,$(error CHUNKS_DIR is empty; refusing to run rm -rf))
	$(if $(strip $(RES_DIR)),,$(error RES_DIR is empty; refusing to run rm -rf))
	rm -rf "$(CHUNKS_DIR)"/* "$(RES_DIR)"/*
	rm -rf __pycache__ src/__pycache__ tests/__pycache__
	rm -rf .pytest_cache .coverage htmlcov/
	@printf '$(GREEN)Cleaned all generated files$(NC)\n'
# Remove only the contents of $(CHUNKS_DIR); the directory itself is kept.
# The guard line aborts if CHUNKS_DIR is empty, since the glob below would
# otherwise expand to `/*`.
clean-chunks:
	$(if $(strip $(CHUNKS_DIR)),,$(error CHUNKS_DIR is empty; refusing to run rm -rf))
	rm -rf "$(CHUNKS_DIR)"/*
	@echo "Cleaned $(CHUNKS_DIR)/"
# Remove tool caches only, leaving the dataset directories untouched:
# the __pycache__ directories at the project root, src/ and tests/ (nested
# ones are not covered), plus .pytest_cache, .coverage and htmlcov/.
clean-cache:
	rm -rf __pycache__ src/__pycache__ tests/__pycache__
	rm -rf .pytest_cache .coverage htmlcov/
	@echo "Cleaned cache files"