|
1 | 1 | import sys
|
2 |
| -import csv |
3 | 2 | import json
|
| 3 | +import os |
4 | 4 | import io
|
5 | 5 |
|
6 |
| -from argparse import ArgumentParser |
7 |
| -from argparse import RawTextHelpFormatter |
8 |
| -from copy import deepcopy |
9 |
| -from collections import namedtuple |
10 | 6 | from lxml import etree
|
11 | 7 |
|
12 |
| -Filter = namedtuple("Filter", "XPath jsonattr type force_array") |
| 8 | +from xmlSplit import xmlSplit |
| 9 | +from jsonserialize import json_serialize |
| 10 | +from argumentparser import parser |
| 11 | +from filterprocessor import load_filters |
| 12 | +from mrjobxmljsonifier import MrJobXMLJSONifier |
13 | 13 |
|
14 |
| -parser = ArgumentParser(description='''Filters contents of large xml data sources and produces json-ified results''', formatter_class = RawTextHelpFormatter) |
15 | 14 |
|
16 |
| -parser.add_argument('root', help=''' |
17 |
| -root XPath element name, used to interpret the filter file against |
18 |
| -''') |
19 |
| -parser.add_argument('filter', help=''' |
20 |
| -filter file name, containing column separated filter and map definitions like so: |
21 |
| -[XPath], [target json attribute name], [type], [force array] |
22 |
| -(see sample formatting file included with the source) |
23 |
| -''') |
24 |
| -parser.add_argument('source', help=''' |
25 |
| -source xml formatted file |
26 |
| -''') |
27 |
| -parser.add_argument('--destination', required=False, nargs=1, help=''' |
28 |
| -file name to store the generated json into; if ommited, will output to stdout |
29 |
| -''') |
cargs = parser.parse_args()


if cargs.split is not None:
    # A dedicated split root may be given; otherwise fall back to the
    # root element used for filtering.
    root_elem = cargs.root if cargs.split_root is None else cargs.split_root

    # Split the large source XML into chunk files of 5000 root elements
    # each; xmlSplit returns the directory holding the chunks.
    splitfilesdirectory = xmlSplit(cargs.source, root_elem, 5000)

    # MRJob consumes a manifest file listing one absolute split-file path
    # per line.  Write plain '\n': the file is opened in text mode, so
    # os.linesep would come out as '\r\r\n' on Windows.
    with open("splitfiles.tmp", "w") as split_file_list:
        for path, _subdirs, files in os.walk(splitfilesdirectory):
            for filename in files:
                full_path = os.path.join(os.getcwd(), path, filename)
                split_file_list.write(full_path + '\n')

    # --destination is optional on the command line, but the job below
    # needs it: fail with a clear message instead of a TypeError on
    # None[0].
    if cargs.destination is None:
        sys.exit("--destination is required when --split is used")

    mr_jsonifier = MrJobXMLJSONifier(
        args=['-r', 'local',
              '--jobconf', 'settings.root=' + cargs.root,
              '--jobconf', 'settings.filter=' + cargs.filter,
              '--jobconf', 'settings.destination=' + cargs.destination[0],
              'splitfiles.tmp'])
    try:
        with mr_jsonifier.make_runner() as runner:
            runner.run()
    finally:
        # Always clean up the manifest, even when the job run fails.
        os.remove("splitfiles.tmp")
    sys.exit(0)
| 38 | + |
33 | 39 |
|
# read filter definitions
# NOTE(review): load_filters presumably parses the pipe-delimited filter
# file (XPath | json attribute | type | force-array) that used to be read
# inline here — verify against filterprocessor.
filters = load_filters(cargs.filter)



52 | 45 |
|
# Stream-parse the source XML: fire an event each time a <root> element
# closes, so the whole document never has to be held in memory at once.
context = etree.iterparse(cargs.source, events=('end',), tag=cargs.root)

if cargs.destination is not None:
    # --destination is declared with nargs=1, hence the [0] indexing.
    # 'outstream' instead of 'file' to avoid shadowing the builtin.
    with io.open(cargs.destination[0], 'w') as outstream:
        json_serialize(context, filters, outstream)
else:
    json_serialize(context, filters, sys.stdout)
91 | 56 |
|
92 | 57 |
|
0 commit comments