-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine_transcripts.py
216 lines (171 loc) · 7.62 KB
/
combine_transcripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
Combine Transcripts - A utility script to merge multiple transcript files into one.
This script provides functionality to:
1. Discover transcript files in a specified directory
2. Merge multiple transcripts into a single file
3. Add clear separators and headers for readability
4. Handle errors gracefully with informative messages
Author: OkhDev
Version: 1.0.1
"""
import os
from pathlib import Path
from typing import List, Optional
from datetime import datetime
# ============================================================================
# Constants and Configuration
# ============================================================================
# Default directory scanned for transcript files (the default `input_dir` of TranscriptCombiner).
TRANSCRIPTS_DIR = Path("transcripts")
# Fixed output filename. NOTE(review): not referenced anywhere in the visible code, which
# generates a timestamped name via get_output_filename() instead — confirm whether still needed.
DEFAULT_OUTPUT = Path("combined_transcript.txt")
class Colors:
    """ANSI escape sequences used to colorize terminal status lines."""

    # Foreground colors, one per status kind.
    GREEN = "\033[92m"
    RED = "\033[91m"
    YELLOW = "\033[93m"
    # Restores the terminal's default attributes after a colored message.
    RESET = "\033[0m"
class Symbols:
    """Single-character Unicode glyphs that prefix each status message."""

    CHECK = "✓"    # success
    CROSS = "✗"    # error
    WARNING = "⚠"  # warning
    INFO = "ℹ"     # informational / fallback
# ============================================================================
# Utility Functions
# ============================================================================
def print_status(message: str, status: str = "info") -> None:
    """
    Print ``message`` to stdout, prefixed with a status symbol and colorized.

    Args:
        message (str): Text to display.
        status (str): One of "success", "error", "warning" or "info".
            Any other value is rendered like "info".
    """
    # Unknown status names fall back to the neutral "info" appearance.
    fallback = (Colors.RESET, Symbols.INFO)
    styles = {
        "success": (Colors.GREEN, Symbols.CHECK),
        "error": (Colors.RED, Symbols.CROSS),
        "warning": (Colors.YELLOW, Symbols.WARNING),
        "info": fallback,
    }
    color, symbol = styles[status] if status in styles else fallback

    # Always reset at the end so the color does not leak into later output.
    line = f"{color}{symbol} {message}{Colors.RESET}"
    print(line)
def get_output_filename() -> Path:
    """
    Build an output filename stamped with the current local date and time.

    Returns:
        Path: Relative path of the form 'combined_transcript_YYYYMMDD_HHMMSS.txt'

    Note:
        The stamp has one-second resolution, so names differ between runs
        only when the runs start in different seconds.
    """
    # datetime's format spec is forwarded to strftime().
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    return Path("combined_transcript_" + stamp + ".txt")
# ============================================================================
# Core Functionality
# ============================================================================
class TranscriptCombiner:
    """
    Combine the ``.txt`` transcripts found in a directory into a single document.

    This class manages:
    1. File discovery and validation
    2. Content merging with a header per file and separators between files
    3. Output file generation (timestamped name unless one is supplied)
    4. Error reporting through ``print_status``

    Attributes:
        input_dir (Path): Source directory containing transcript files
        output_file (Path): Destination file for combined transcripts
    """

    def __init__(self, input_dir: Path = TRANSCRIPTS_DIR, output_file: Optional[Path] = None):
        """
        Initialize the TranscriptCombiner with input and output paths.

        Args:
            input_dir (Path): Directory containing transcript files to combine
            output_file (Optional[Path]): Custom output file path. If None (or
                otherwise falsy), a timestamped name is generated.
        """
        self.input_dir = input_dir
        self.output_file = output_file or get_output_filename()

    def get_transcript_files(self) -> List[Path]:
        """
        Discover the transcript files in the input directory.

        Returns:
            List[Path]: Sorted list of ``*.txt`` regular files directly inside
            ``input_dir``; empty if the directory is missing or has no matches.

        Note:
            - Directories that happen to be named ``*.txt`` are skipped.
            - The configured output file is never returned as an input, so a
              previous result stored inside ``input_dir`` is not merged into
              itself.
            - Prints a status line describing the outcome.
        """
        # is_dir() also rejects a regular file passed as the input directory.
        if not self.input_dir.is_dir():
            print_status(f"Directory '{self.input_dir}' not found", "error")
            return []

        output_path = Path(self.output_file).resolve()
        transcript_files = sorted(
            candidate
            for candidate in self.input_dir.glob("*.txt")
            if candidate.is_file() and candidate.resolve() != output_path
        )

        if not transcript_files:
            print_status("No transcript files found", "warning")
        else:
            print_status(f"Found {len(transcript_files)} transcript file(s)", "success")
        return transcript_files

    def combine_files(self, transcript_files: List[Path]) -> bool:
        """
        Merge multiple transcript files into a single document.

        Args:
            transcript_files (List[Path]): Transcript files to combine, in order

        Returns:
            bool: True if every file was merged and the output was written,
            False if the list is empty or any error occurred. Errors are
            reported via ``print_status`` rather than raised.

        Process:
            1. Content is written to a sibling ``<output name>.part`` file.
            2. Each input is read (UTF-8, surrounding whitespace stripped)
               before its header is written, so a failed read leaves no
               dangling header.
            3. A separator line is written between consecutive files.
            4. Only after every file succeeded is the ``.part`` file renamed
               onto ``output_file``; on failure it is removed, so a partial
               document is never left at the destination and an existing
               output is not truncated.
        """
        if not transcript_files:
            return False

        partial_path = None
        try:
            print_status("Starting file combination...", "info")

            output_path = Path(self.output_file)
            partial_path = output_path.with_name(output_path.name + ".part")
            rule = "=" * 80
            last_index = len(transcript_files) - 1

            with open(partial_path, 'w', encoding='utf-8') as outfile:
                for i, transcript_file in enumerate(transcript_files):
                    try:
                        # Read first: if this fails nothing has been written
                        # for this transcript yet.
                        with open(transcript_file, 'r', encoding='utf-8') as infile:
                            content = infile.read().strip()

                        # Add file header
                        outfile.write(f"# Transcript from: {transcript_file.name}\n")
                        outfile.write(rule + "\n\n")

                        # Write file content
                        outfile.write(content)

                        # Add separator between files (not after the last one)
                        if i < last_index:
                            outfile.write("\n\n" + rule + "\n\n")

                        print_status(f"Processed: {transcript_file.name}", "success")
                    except Exception as e:
                        # Broad on purpose: the method's contract is to report
                        # and return False instead of raising.
                        print_status(f"Error processing {transcript_file.name}: {str(e)}", "error")
                        return False

            # The temporary file is closed here; publish it in one step.
            os.replace(partial_path, output_path)

            print_status(
                f"Successfully combined {len(transcript_files)} files into '{self.output_file}'",
                "success"
            )
            return True
        except Exception as e:
            print_status(f"Error creating output file: {str(e)}", "error")
            return False
        finally:
            # Best-effort cleanup of the temporary file. After a successful
            # rename it no longer exists, which is the expected case.
            if partial_path is not None:
                try:
                    partial_path.unlink()
                except OSError:
                    pass
# ============================================================================
# Main Entry Point
# ============================================================================
def main() -> None:
    """
    Run the transcript combination utility with default settings.

    Workflow:
        1. Create a TranscriptCombiner using its default paths
        2. Ask it for the available transcript files
        3. If none were found, print a hint and stop
        4. Otherwise combine them (the boolean result is not inspected here)

    Ctrl+C and any other exception are reported via print_status instead of
    propagating.
    """
    try:
        combiner = TranscriptCombiner()
        discovered = combiner.get_transcript_files()

        # Nothing to merge: tell the user where files are expected.
        if not discovered:
            print_status("Please add transcript files to the 'transcripts' directory", "info")
            return

        combiner.combine_files(discovered)
    except KeyboardInterrupt:
        print_status("\nOperation interrupted by user", "warning")
    except Exception as e:
        print_status(f"An unexpected error occurred: {e!s}", "error")


if __name__ == "__main__":
    main()