diff --git a/xword_dl/downloader/amuniversaldownloader.py b/xword_dl/downloader/amuniversaldownloader.py index da61753..1885b06 100644 --- a/xword_dl/downloader/amuniversaldownloader.py +++ b/xword_dl/downloader/amuniversaldownloader.py @@ -4,7 +4,6 @@ import time import xml -import puz import requests import xmltodict @@ -60,18 +59,17 @@ def parse_xword(self, xw_data): - for field in ['Title', 'Author', 'Editor', 'Copryight']: + for field in ['Title', 'Author', 'Editor', 'Copyright']: fetched[field] = unquote(xw_data.get(field, '')).strip() - puzzle = puz.Puzzle() - puzzle.title = fetched.get('Title', '') - puzzle.author = ''.join([fetched.get('Author', ''), + self.puzzle.title = fetched.get('Title', '') + self.puzzle.author = ''.join([fetched.get('Author', ''), ' / Ed. ', fetched.get('Editor', '')]) - puzzle.copyright = fetched.get('Copyright', '') - puzzle.width = int(xw_data.get('Width')) - puzzle.height = int(xw_data.get('Height')) + self.puzzle.copyright = fetched.get('Copyright', '') + self.puzzle.width = int(xw_data.get('Width')) + self.puzzle.height = int(xw_data.get('Height')) solution = xw_data.get('AllAnswer').replace('-', '.') - puzzle.solution = solution + self.puzzle.solution = solution fill = '' for letter in solution: @@ -79,7 +77,7 @@ def parse_xword(self, xw_data): fill += '.' else: fill += '-' - puzzle.fill = fill + self.puzzle.fill = fill across_clues = xw_data['AcrossClue'].splitlines() down_clues = self.process_clues(xw_data['DownClue'].splitlines()) @@ -93,9 +91,9 @@ def parse_xword(self, xw_data): clues = [clue['clue'] for clue in clues_sorted] - puzzle.clues = clues + self.puzzle.clues = clues - return puzzle + return self.puzzle # As of Sept 2023, the JSON data for USA Today is not consistently populated. 
# I'd rather use the JSON data if possible, but until that's sorted, we can @@ -167,24 +165,22 @@ def parse_xword(self, xw_data): except (xml.parsers.expat.ExpatError, KeyError): raise XWordDLException('Puzzle data malformed, cannot parse.') - puzzle = puz.Puzzle() + self.puzzle.title = unquote(xw.get('Title',[]).get('@v') or '') + self.puzzle.author = unquote(xw.get('Author',[]).get('@v') or '') + self.puzzle.copyright = unquote(xw.get('Copyright',[]).get('@v') or '') - puzzle.title = unquote(xw.get('Title',[]).get('@v') or '') - puzzle.author = unquote(xw.get('Author',[]).get('@v') or '') - puzzle.copyright = unquote(xw.get('Copyright',[]).get('@v') or '') + self.puzzle.width = int(xw.get('Width')['@v']) + self.puzzle.height = int(xw.get('Height')['@v']) - puzzle.width = int(xw.get('Width')['@v']) - puzzle.height = int(xw.get('Height')['@v']) - - puzzle.solution = xw.get('AllAnswer',[]).get('@v').replace('-', '.') - puzzle.fill = ''.join([c if c == '.' else '-' for c in puzzle.solution]) + self.puzzle.solution = xw.get('AllAnswer',[]).get('@v').replace('-', '.') + self.puzzle.fill = ''.join([c if c == '.' 
else '-' for c in self.puzzle.solution]) xw_clues = sorted(list(xw['across'].values()) + list(xw['down'].values()), key=lambda c: int(c['@cn'])) - puzzle.clues = [unquote(c.get('@c') or '') for c in xw_clues] + self.puzzle.clues = [unquote(c.get('@c') or '') for c in xw_clues] - return puzzle + return self.puzzle class UniversalDownloader(AMUniversalDownloader): diff --git a/xword_dl/downloader/amuselabsdownloader.py b/xword_dl/downloader/amuselabsdownloader.py index 5ed24c8..07822e9 100644 --- a/xword_dl/downloader/amuselabsdownloader.py +++ b/xword_dl/downloader/amuselabsdownloader.py @@ -3,7 +3,6 @@ import json import urllib.parse -import puz import requests import re @@ -174,12 +173,11 @@ def fetch_data(self, solver_url): return xword_data def parse_xword(self, xw_data): - puzzle = puz.Puzzle() - puzzle.title = xw_data.get('title', '').strip() - puzzle.author = xw_data.get('author', '').strip() - puzzle.copyright = xw_data.get('copyright', '').strip() - puzzle.width = xw_data.get('w') - puzzle.height = xw_data.get('h') + self.puzzle.title = xw_data.get('title', '').strip() + self.puzzle.author = xw_data.get('author', '').strip() + self.puzzle.copyright = xw_data.get('copyright', '').strip() + self.puzzle.width = xw_data.get('w') + self.puzzle.height = xw_data.get('h') markup_data = xw_data.get('cellInfos', '') @@ -215,8 +213,8 @@ def parse_xword(self, xw_data): rebus_table += '{:2d}:{};'.format(rebus_index, unidecode(cell)) rebus_index += 1 - puzzle.solution = solution - puzzle.fill = fill + self.puzzle.solution = solution + self.puzzle.fill = fill placed_words = xw_data['placedWords'] @@ -225,23 +223,23 @@ def parse_xword(self, xw_data): clues = [word['clue']['clue'] for word in weirdass_puz_clue_sorting] - puzzle.clues.extend(clues) + self.puzzle.clues.extend(clues) has_markup = b'\x80' in markup has_rebus = any(rebus_board) if has_markup: - puzzle.extensions[b'GEXT'] = markup - puzzle._extensions_order.append(b'GEXT') - puzzle.markup() + 
self.puzzle.extensions[b'GEXT'] = markup + self.puzzle._extensions_order.append(b'GEXT') + self.puzzle.markup() if has_rebus: - puzzle.extensions[b'GRBS'] = bytes(rebus_board) - puzzle.extensions[b'RTBL'] = rebus_table.encode(puz.ENCODING) - puzzle._extensions_order.extend([b'GRBS', b'RTBL']) - puzzle.rebus() + self.puzzle.extensions[b'GRBS'] = bytes(rebus_board) + self.puzzle.extensions[b'RTBL'] = rebus_table.encode(self.puzzle.encoding) + self.puzzle._extensions_order.extend([b'GRBS', b'RTBL']) + self.puzzle.rebus() - return puzzle + return self.puzzle def pick_filename(self, puzzle, **kwargs): if not self.date and self.id: diff --git a/xword_dl/downloader/basedownloader.py b/xword_dl/downloader/basedownloader.py index 9702a7f..09e8268 100644 --- a/xword_dl/downloader/basedownloader.py +++ b/xword_dl/downloader/basedownloader.py @@ -1,8 +1,8 @@ import urllib.parse from datetime import datetime +import puz import requests -from puz import Puzzle from ..util import ( read_config_values, @@ -42,7 +42,16 @@ def __init__(self, **kwargs): self.session.headers.update(self.settings.get('headers', {})) self.session.cookies.update(self.settings.get('cookies', {})) - def pick_filename(self, puzzle: Puzzle, **kwargs) -> str: + self.puzzle = puz.Puzzle() + + if 'puzzle_v1' not in kwargs: + # this is hack-ily patching constants that puzpy does not + # currently provide a method for setting + self.puzzle.version = b'2.0' + self.puzzle.fileversion = b'2.0\0' + self.puzzle.encoding = 'UTF-8' + + def pick_filename(self, puzzle: puz.Puzzle, **kwargs) -> str: tokens = {'outlet': self.outlet or '', 'prefix': self.outlet_prefix or '', 'title': puzzle.title or '', @@ -77,7 +86,7 @@ def pick_filename(self, puzzle: Puzzle, **kwargs) -> str: return template - def download(self, url: str) -> Puzzle: + def download(self, url: str) -> puz.Puzzle: """Download, parse, and return a puzzle at a given URL.""" solver_url = self.find_solver(url) @@ -86,7 +95,8 @@ def download(self, url: str) -> 
Puzzle: puzzle = sanitize_for_puzfile( puzzle, - preserve_html=self.settings.get("preserve_html", False) + preserve_html=self.settings.get("preserve_html", False), + demojize=(self.puzzle.encoding != "UTF-8") ) return puzzle @@ -108,7 +118,7 @@ def fetch_data(self, solver_url: str): """ raise NotImplementedError - def parse_xword(self, xw_data) -> Puzzle: + def parse_xword(self, xw_data) -> puz.Puzzle: """Given a blob of crossword data, parse and stuff into puz format. This method is implemented in subclasses based on the differences in diff --git a/xword_dl/downloader/compilerdownloader.py b/xword_dl/downloader/compilerdownloader.py index e39580d..c86b023 100644 --- a/xword_dl/downloader/compilerdownloader.py +++ b/xword_dl/downloader/compilerdownloader.py @@ -1,4 +1,3 @@ -import puz import requests import urllib.parse import xmltodict @@ -50,14 +49,16 @@ def parse_xword(self, xw_data, enumeration=True): xw_metadata = xw_puzzle['metadata'] xw_grid = xw_puzzle['crossword']['grid'] - puzzle = puz.Puzzle() + self.puzzle.title = xw_metadata.get('title') or '' + self.puzzle.author = xw_metadata.get('creator') or '' + self.puzzle.copyright = xw_metadata.get('copyright') or '' - puzzle.title = xw_metadata.get('title') or '' - puzzle.author = xw_metadata.get('creator') or '' - puzzle.copyright = xw_metadata.get('copyright') or '' + self.puzzle.title = xw_metadata.get('title') or '' + self.puzzle.author = xw_metadata.get('creator') or '' + self.puzzle.copyright = xw_metadata.get('copyright') or '' - puzzle.width = int(xw_grid['@width']) - puzzle.height = int(xw_grid['@height']) + self.puzzle.width = int(xw_grid['@width']) + self.puzzle.height = int(xw_grid['@height']) solution = '' fill = '' @@ -65,15 +66,15 @@ def parse_xword(self, xw_data, enumeration=True): cells = {(int(cell['@x']), int(cell['@y'])): cell for cell in xw_grid['cell']} - for y in range(1, puzzle.height + 1): - for x in range(1, puzzle.width + 1): + for y in range(1, self.puzzle.height + 1): + for x in 
range(1, self.puzzle.width + 1): cell = cells[(x, y)] solution += cell.get('@solution', '.') fill += '.' if cell.get('@type') == 'block' else '-' markup += (b'\x80' if (cell.get('@background-shape') == 'circle') else b'\x00') - puzzle.solution = solution - puzzle.fill = fill + self.puzzle.solution = solution + self.puzzle.fill = fill xw_clues = xw_puzzle['crossword']['clues'] @@ -83,13 +84,13 @@ def parse_xword(self, xw_data, enumeration=True): if c.get("@format") and enumeration else '') for c in sorted(all_clues, key=lambda x: int(x['@number']))] - puzzle.clues = clues + self.puzzle.clues = clues has_markup = b'\x80' in markup if has_markup: - puzzle.extensions[b'GEXT'] = markup - puzzle._extensions_order.append(b'GEXT') - puzzle.markup() + self.puzzle.extensions[b'GEXT'] = markup + self.puzzle._extensions_order.append(b'GEXT') + self.puzzle.markup() - return puzzle + return self.puzzle diff --git a/xword_dl/downloader/guardiandownloader.py b/xword_dl/downloader/guardiandownloader.py index 3984cdc..9b13907 100644 --- a/xword_dl/downloader/guardiandownloader.py +++ b/xword_dl/downloader/guardiandownloader.py @@ -2,7 +2,6 @@ import json import re -import puz import requests from bs4 import BeautifulSoup, Tag @@ -54,16 +53,14 @@ def fetch_data(self, solver_url): return xw_data def parse_xword(self, xw_data): - puzzle = puz.Puzzle() + self.puzzle.author = xw_data.get('creator', {}).get('name') or '' + self.puzzle.height = xw_data.get('dimensions').get('rows') + self.puzzle.width = xw_data.get('dimensions').get('cols') - puzzle.author = xw_data.get('creator', {}).get('name', '') - puzzle.height = xw_data.get('dimensions').get('rows') - puzzle.width = xw_data.get('dimensions').get('cols') - - puzzle.title = xw_data.get('name') or '' + self.puzzle.title = xw_data.get('name') or '' if not all(e.get('solution') for e in xw_data['entries']): - puzzle.title += ' - no solution provided' + self.puzzle.title += ' - no solution provided' self.date = 
datetime.datetime.fromtimestamp( xw_data['date'] // 1000) @@ -80,21 +77,21 @@ def parse_xword(self, xw_data): solution = '' fill = '' - for y in range(puzzle.height): - for x in range(puzzle.width): + for y in range(self.puzzle.height): + for x in range(self.puzzle.width): sol_at_space = grid_dict.get((x,y), '.') solution += sol_at_space fill += '.' if sol_at_space == '.' else '-' - puzzle.solution = solution - puzzle.fill = fill + self.puzzle.solution = solution + self.puzzle.fill = fill clues = [e.get('clue') for e in sorted(xw_data.get('entries'), key=lambda x: (x.get('number'), x.get('direction')))] - puzzle.clues = clues + self.puzzle.clues = clues - return puzzle + return self.puzzle class GuardianCrypticDownloader(GuardianDownloader): diff --git a/xword_dl/downloader/newyorktimesdownloader.py b/xword_dl/downloader/newyorktimesdownloader.py index 385c21f..dfe41f6 100644 --- a/xword_dl/downloader/newyorktimesdownloader.py +++ b/xword_dl/downloader/newyorktimesdownloader.py @@ -1,7 +1,6 @@ import datetime import urllib.parse -import puz import requests from getpass import getpass @@ -113,22 +112,19 @@ def fetch_data(self, solver_url): return xword_data def parse_xword(self, xw_data): - puzzle = puz.Puzzle() - - puzzle.author = join_bylines(xw_data['constructors'], "and").strip() - puzzle.copyright = xw_data['copyright'] - puzzle.height = int(xw_data['body'][0]['dimensions']['height']) - puzzle.width = int(xw_data['body'][0]['dimensions']['width']) + self.puzzle.author = join_bylines(xw_data['constructors'], "and").strip() + self.puzzle.copyright = xw_data['copyright'] + self.puzzle.height = int(xw_data['body'][0]['dimensions']['height']) + self.puzzle.width = int(xw_data['body'][0]['dimensions']['width']) if not self.date: self.date = datetime.datetime.strptime(xw_data['publicationDate'], '%Y-%m-%d') - puzzle.title = xw_data.get('title') or self.date.strftime( - '%A, %B %d, %Y') + self.puzzle.title = xw_data.get('title') or self.date.strftime('%A, %B %d, %Y') 
if xw_data.get('notes'): - puzzle.notes = xw_data.get('notes')[0]['text'] + self.puzzle.notes = xw_data.get('notes')[0]['text'] solution = '' fill = '' @@ -161,26 +157,26 @@ def parse_xword(self, xw_data): markup += (b'\x00' if square.get('type', 1) == 1 else b'\x80') - puzzle.solution = solution - puzzle.fill = fill + self.puzzle.solution = solution + self.puzzle.fill = fill if b'\x80' in markup: - puzzle.extensions[b'GEXT'] = markup - puzzle._extensions_order.append(b'GEXT') - puzzle.markup() + self.puzzle.extensions[b'GEXT'] = markup + self.puzzle._extensions_order.append(b'GEXT') + self.puzzle.markup() if any(rebus_board): - puzzle.extensions[b'GRBS'] = bytes(rebus_board) - puzzle.extensions[b'RTBL'] = rebus_table.encode(puz.ENCODING) - puzzle._extensions_order.extend([b'GRBS', b'RTBL']) - puzzle.rebus() + self.puzzle.extensions[b'GRBS'] = bytes(rebus_board) + self.puzzle.extensions[b'RTBL'] = rebus_table.encode(self.puzzle.encoding) + self.puzzle._extensions_order.extend([b'GRBS', b'RTBL']) + self.puzzle.rebus() clue_list = xw_data['body'][0]['clues'] clue_list.sort(key=lambda c: (int(c['label']), c['direction'])) - puzzle.clues = [c['text'][0].get('plain') or '' for c in clue_list] + self.puzzle.clues = [c['text'][0].get('plain') or '' for c in clue_list] - return puzzle + return self.puzzle def pick_filename(self, puzzle, **kwargs): if puzzle.title == self.date.strftime('%A, %B %d, %Y'): diff --git a/xword_dl/downloader/puzzmodownloader.py b/xword_dl/downloader/puzzmodownloader.py index 082b3d6..113cd0f 100644 --- a/xword_dl/downloader/puzzmodownloader.py +++ b/xword_dl/downloader/puzzmodownloader.py @@ -2,7 +2,6 @@ import secrets import dateparser -import puz from datetime import datetime, timedelta from zoneinfo import ZoneInfo @@ -114,13 +113,11 @@ def fetch_data(self, solver_url): return xw_data def parse_xword(self, xw_data): - puzzle = puz.Puzzle() - self.date = dateparser.parse(xw_data['dailyTitle']) or \ 
dateparser.parse(xw_data['dailyTitle'].split('-')[0]) - puzzle.title = xw_data.get('name','') - puzzle.author = join_bylines([a.get('publishingName') or a.get('name') \ + self.puzzle.title = xw_data.get('name','') + self.puzzle.author = join_bylines([a.get('publishingName') or a.get('name') \ for a in xw_data['authors']]) puzzle_lines = [l.strip() for l in xw_data['puzzle'].splitlines()] @@ -159,12 +156,12 @@ def parse_xword(self, xw_data): # less reliable than the other API-provided fields, so we will # only fall back to them. - if k == 'title' and not puzzle.title: - puzzle.title = v - elif k == 'author' and not puzzle.author: - puzzle.author = v + if k == 'title' and not self.puzzle.title: + self.puzzle.title = v + elif k == 'author' and not self.puzzle.author: + self.puzzle.author = v elif k == 'copyright': - puzzle.copyright = v.strip(' ©') + self.puzzle.copyright = v.strip(' ©') elif section == 'grid': if not observed_width: @@ -196,21 +193,21 @@ def parse_xword(self, xw_data): markup += b'\x00' if c in '#.' 
else b'\x80' - puzzle.height = observed_height - puzzle.width = observed_width - puzzle.solution = solution - puzzle.fill = fill + self.puzzle.height = observed_height + self.puzzle.width = observed_width + self.puzzle.solution = solution + self.puzzle.fill = fill if b'\x80' in markup: - puzzle.extensions[b'GEXT'] = markup - puzzle._extensions_order.append(b'GEXT') - puzzle.markup() + self.puzzle.extensions[b'GEXT'] = markup + self.puzzle._extensions_order.append(b'GEXT') + self.puzzle.markup() clue_list.sort(key=lambda c: (c[1], c[0])) - puzzle.clues = [c[2].split(' ~ ')[0].strip() for c in clue_list] + self.puzzle.clues = [c[2].split(' ~ ')[0].strip() for c in clue_list] - return puzzle + return self.puzzle class PuzzmoBigDownloader(PuzzmoDownloader): diff --git a/xword_dl/downloader/wsjdownloader.py b/xword_dl/downloader/wsjdownloader.py index 91b2da3..f80d9ec 100644 --- a/xword_dl/downloader/wsjdownloader.py +++ b/xword_dl/downloader/wsjdownloader.py @@ -1,7 +1,5 @@ import datetime -import puz - from bs4 import BeautifulSoup, Tag from .basedownloader import BaseDownloader @@ -71,14 +69,13 @@ def parse_xword(self, xw_data): self.date = datetime.datetime.strptime(date_string, '%Y/%m/%d') - puzzle = puz.Puzzle() - puzzle.title = xword_metadata.get('title') or '' - puzzle.author = xword_metadata.get('byline') or '' - puzzle.copyright = xword_metadata.get('publisher') or '' - puzzle.width = int(xword_metadata.get('gridsize').get('cols')) - puzzle.height = int(xword_metadata.get('gridsize').get('rows')) + self.puzzle.title = xword_metadata.get('title') or '' + self.puzzle.author = xword_metadata.get('byline') or '' + self.puzzle.copyright = xword_metadata.get('publisher') or '' + self.puzzle.width = int(xword_metadata.get('gridsize').get('cols')) + self.puzzle.height = int(xword_metadata.get('gridsize').get('rows')) - puzzle.notes = xword_metadata.get('crosswordadditionalcopy') or '' + self.puzzle.notes = xword_metadata.get('crosswordadditionalcopy') or '' solution = 
'' fill = '' @@ -98,11 +95,11 @@ def parse_xword(self, xw_data): == 'circle') else b'\x00') - puzzle.fill = fill - puzzle.solution = solution + self.puzzle.fill = fill + self.puzzle.solution = solution - if all(c in ['.', 'X'] for c in puzzle.solution): - puzzle.solution_state = 0x0002 + if all(c in ['.', 'X'] for c in self.puzzle.solution): + self.puzzle.solution_state = 0x0002 clue_list = xword_metadata['clues'][0]['clues'] + \ xword_metadata['clues'][1]['clues'] @@ -110,13 +107,13 @@ def parse_xword(self, xw_data): clues = [clue['clue'] for clue in sorted_clue_list] - puzzle.clues = clues + self.puzzle.clues = clues has_markup = b'\x80' in markup if has_markup: - puzzle.extensions[b'GEXT'] = markup - puzzle._extensions_order.append(b'GEXT') - puzzle.markup() + self.puzzle.extensions[b'GEXT'] = markup + self.puzzle._extensions_order.append(b'GEXT') + self.puzzle.markup() - return puzzle + return self.puzzle diff --git a/xword_dl/util/utils.py b/xword_dl/util/utils.py index 9c2fbb5..6f310c9 100644 --- a/xword_dl/util/utils.py +++ b/xword_dl/util/utils.py @@ -51,23 +51,24 @@ def remove_invalid_chars_from_filename(filename: str): return filename -def cleanup(field: str, preserve_html=False): - if preserve_html: - field = unidecode(emoji.demojize(field)).strip() - else: - field = unidecode(emoji.demojize(html2text(field, - bodywidth=0))).strip() +def cleanup(field: str, preserve_html=False, demojize=True): + if not preserve_html: + field = html2text(field, bodywidth=0) + if demojize: + field = unidecode(emoji.demojize(field)) + field = field.strip() + return field -def sanitize_for_puzfile(puzzle: Puzzle, preserve_html=False) -> Puzzle: - puzzle.title = cleanup(puzzle.title, preserve_html) - puzzle.author = cleanup(puzzle.author, preserve_html) - puzzle.copyright = cleanup(puzzle.copyright, preserve_html) +def sanitize_for_puzfile(puzzle: Puzzle, preserve_html=False, demojize=True): + puzzle.title = cleanup(puzzle.title, preserve_html, demojize) + puzzle.author = cleanup(puzzle.author, 
preserve_html, demojize) + puzzle.copyright = cleanup(puzzle.copyright, preserve_html, demojize) - puzzle.notes = cleanup(puzzle.notes, preserve_html) + puzzle.notes = cleanup(puzzle.notes, preserve_html, demojize) - puzzle.clues = [cleanup(clue, preserve_html) for clue in puzzle.clues] + puzzle.clues = [cleanup(clue, preserve_html, demojize) for clue in puzzle.clues] return puzzle diff --git a/xword_dl/xword_dl.py b/xword_dl/xword_dl.py index 0cdd484..2fd5098 100644 --- a/xword_dl/xword_dl.py +++ b/xword_dl/xword_dl.py @@ -241,6 +241,15 @@ def main(): will be used)"""), default=None, ) + parser.add_argument( + "-1", + "--v1", + help=textwrap.dedent("""\ + saves the output file as an AcrossLite v1.4 + file (by default, version 2.0 is used)"""), + action="store_true", + default=False, + ) args = parser.parse_args() if args.authenticate and args.source: @@ -275,6 +284,8 @@ def main(): options["preserve_html"] = args.preserve_html if args.output: options["filename"] = args.output + if args.v1: + options["puzzle_v1"] = True if args.date: options["date"] = args.date if args.settings: