crossword-fetch/rename-library.py at main · clodpated/crossword-fetch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python3
"""
rename-library.py — Rename all .puz files to the human-readable convention:

    YYYY-MM-DD - Publisher - Title - Author.puz

Reads title/author from each .puz file's metadata.
Uses "Untitled" for missing title, "Unlisted" for missing author.

Determines the publisher prefix from either:
  - The old code-based filename (e.g. nyt-20260113.puz → "NY Times")
  - The existing filename for files already in new format

Usage:
    python3 rename-library.py [--dry-run]
"""

import argparse
import filecmp
import re
from pathlib import Path

import puz

# Map xword-dl command codes → outlet_prefix (from xword-dl source)
# Keys are the lowercase source codes that start old-style filenames
# ("<code>-YYYYMMDD.puz"); values are the publisher names written into
# the new human-readable filenames.
CODE_TO_PREFIX = {
    "nyt":   "NY Times",
    "nytm":  "NY Times Mini",
    "nytd":  "NY Times Midi",
    "nytv":  "NY Times Variety",
    "lat":   "LA Times",
    "latm":  "LA Times Mini",
    "usa":   "USA Today",
    "uni":   "Universal",
    "nd":    "Newsday",
    "wp":    "WaPo",
    "pop":   "Daily Pop",
    "pzm":   "Puzzmo",
    "pzmb":  "Puzzmo Big",
    "tny":   "New Yorker",
    "tnym":  "New Yorker Mini",
    "atl":   "Atlantic",
    "bill":  "Billboard",
    "vox":   "Vox",
    "vult":  "Vulture",
    "db":    "Daily Beast",
    "wal":   "The Walrus",
    "club":  "Crossword Club",
    # fetch-extras.py sources
    "wsj":   "WSJ",
    "ucsun": "Universal Sunday",
}

# For files already in new format, map known prefix strings back
# (in case they need title/author fixup)
# A publisher segment parsed from a filename is only accepted when it is
# a member of this set, i.e. exactly one of the values above.
KNOWN_PREFIXES = set(CODE_TO_PREFIX.values())


def safe_filename(s):
    """Remove characters that are illegal in filenames.

    Strips the characters reserved on Windows (``\\ / : * ? " < > |``) as
    well as ASCII control characters (0x00-0x1F), then trims leading and
    trailing whitespace. The backslash matters because it is a path
    separator on Windows, and control characters (including NUL and
    newlines) are not valid in filenames there either.

    Args:
        s: The candidate filename (or filename fragment) as a string.

    Returns:
        The string with the offending characters deleted and surrounding
        whitespace removed. May be empty.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', s).strip()


def extract_code_and_date(filename):
    """Split an old-style name like 'nyt-20260113.puz' into its parts.

    Returns a ``(code, 'YYYY-MM-DD')`` tuple when the filename follows the
    'code-YYYYMMDD.puz' layout, otherwise ``(None, None)``.
    """
    match = re.match(r'^([a-z]+)-(\d{4})(\d{2})(\d{2})\.puz$', filename)
    if match is None:
        return None, None
    source_code, year, month, day = match.groups()
    return source_code, "-".join((year, month, day))


def extract_prefix_from_new_format(filename):
    """Return the publisher segment of a new-format filename, if recognised.

    Accepts both 'YYYY-MM-DD - Publisher - ...' and the older variant with
    only whitespace between the date and the publisher. The segment is
    returned only when it is one of KNOWN_PREFIXES; in every other case
    the result is None.
    """
    # The publisher is the shortest run of text between the date and the
    # first " - " separator that follows it.
    match = re.match(r'^\d{4}-\d{2}-\d{2}\s+(?:-\s+)?(.+?)\s+-\s+', filename)
    if match is None:
        return None
    candidate = match.group(1)
    return candidate if candidate in KNOWN_PREFIXES else None


def build_new_name(date_str, prefix, title, author):
    """Assemble 'YYYY-MM-DD - Publisher - Title - Author.puz'.

    An empty title becomes "Untitled" and an empty author becomes
    "Unlisted". The joined name is passed through safe_filename before
    the ".puz" extension is appended.
    """
    shown_title = title or "Untitled"
    shown_author = author or "Unlisted"
    stem = " - ".join((date_str, prefix, shown_title, shown_author))
    return safe_filename(stem) + ".puz"


def extract_date_from_filename(filename):
    """Pull a 'YYYY-MM-DD' date out of an old- or new-format filename.

    Returns None when the filename matches neither layout.
    """
    # New format starts with the ISO date followed by whitespace:
    # "2026-03-13 - Publisher - ..."
    new_style = re.match(r'^(\d{4}-\d{2}-\d{2})\s', filename)
    if new_style is not None:
        return new_style.group(1)

    # Old format packs the date without dashes: "code-YYYYMMDD.puz"
    old_style = re.match(r'^[a-z]+-(\d{4})(\d{2})(\d{2})\.puz$', filename)
    if old_style is None:
        return None
    return "-".join(old_style.groups())


def main():
    """Rename every .puz file under ~/Crosswords to the new convention.

    For each file the publisher and date are derived from the filename
    (old 'code-YYYYMMDD.puz' or the new human-readable format), the title
    and author are read from the .puz metadata, and the file is renamed in
    place within its current directory.

    Collisions are resolved by content: a file whose target name is already
    taken by a byte-identical file is deleted as a duplicate, while a file
    with different content is left untouched and reported as a conflict.

    With ``--dry-run`` nothing on disk is modified; the planned actions are
    printed instead.
    """
    parser = argparse.ArgumentParser(description="Rename .puz library to new convention")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be renamed without doing it")
    args = parser.parse_args()

    base = Path.home() / "Crosswords"
    puz_files = sorted(base.rglob("*.puz"))

    renamed = 0
    skipped = 0
    dupes = 0
    # Maps every claimed target path to the source file that holds it (or,
    # in a dry run, would hold it). Keeping the source lets a dry run
    # compare contents even though the target does not exist on disk yet.
    claimed = {}

    for f in puz_files:
        # Skip temp files
        if f.name.startswith("."):
            continue

        # Determine publisher prefix and date from filename
        code, date_str = extract_code_and_date(f.name)
        if code and code in CODE_TO_PREFIX:
            prefix = CODE_TO_PREFIX[code]
        else:
            # Maybe already in new format
            prefix = extract_prefix_from_new_format(f.name)
            date_str = extract_date_from_filename(f.name)
            if not prefix or not date_str:
                print(f"  SKIP (unknown format): {f.name}")
                skipped += 1
                continue

        # Read .puz metadata. Deliberately best-effort: an unreadable file
        # is still renamed, just with the placeholder title/author.
        try:
            p = puz.read(str(f))
            title = (p.title or "").strip()
            author = (p.author or "").strip()
        except Exception as e:
            print(f"  WARN reading {f.name}: {e} (using Untitled/Unlisted)")
            title = ""
            author = ""

        new_name = build_new_name(date_str, prefix, title, author)
        new_path = f.parent / new_name

        # Already correct?
        if f.name == new_name:
            claimed[new_path] = f
            skipped += 1
            continue

        target_exists = new_path.exists()
        # On case-insensitive or Unicode-normalizing filesystems the target
        # can resolve to the source itself even though the names differ as
        # strings. Treating that as a collision would compare the file with
        # itself and delete it as a "duplicate".
        is_same_file = target_exists and new_path.samefile(f)

        # Collision — check if files are identical before removing
        if (target_exists and not is_same_file) or new_path in claimed:
            # Compare against the real target when it exists; otherwise
            # (dry run) against the file that is slated to take that name.
            other = new_path if target_exists else claimed[new_path]
            if filecmp.cmp(f, other, shallow=False):
                print(f"  DUPE: {f.name} (identical content, removing)")
                if not args.dry_run:
                    f.unlink()
                dupes += 1
            else:
                print(f"  CONFLICT: {f.name} → {new_name} (different content, keeping both)")
                skipped += 1
            continue

        claimed[new_path] = f

        if args.dry_run:
            print(f"  {f.name}")
            print(f"    → {new_name}")
        else:
            f.rename(new_path)

        renamed += 1

    action = "Would rename" if args.dry_run else "Renamed"
    print(f"\n{action} {renamed} file(s), skipped {skipped}, dupes removed {dupes}")


# Run the rename only when executed as a script, not when imported.
if __name__ == "__main__":
    main()