
Commit ab0d651
can now parallelize the work
1 parent 7e6f77f

9 files changed: +241 −79 lines

argumentparser.py (+31)

```python
from argparse import ArgumentParser
from argparse import RawTextHelpFormatter


parser = ArgumentParser(
    description='Filters contents of large xml data sources and produces json-ified results',
    formatter_class=RawTextHelpFormatter)

parser.add_argument('root', help='''
root XPath element name, used to interpret the filter file against
''')
parser.add_argument('filter', help='''
filter file name, containing pipe-separated filter and map definitions like so:
[XPath] | [target json attribute name] | [str|int] | [force array]
(see sample formatting file included with the source)
''')
parser.add_argument('source', help='''
source xml formatted file
''')
parser.add_argument('--destination', required=False, nargs=1, help='''
file name to store the generated json into; if omitted, will output to stdout
''')

parser.add_argument('--split', required=False, help='''
split the xml file into valid xml files containing elements at root;
leaves the split documents in a directory named: [source][timestamp];
performs the json conversion using a mrjob job, running on the split xml
''')

parser.add_argument('--split_root', required=False, help='''
root node to split the xml on, if different from the root node to use when filtering
''')
```
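
As a reference for the format the help text describes, a hypothetical filter file might contain rows like these (the XPaths and attribute names are illustrative, not the sample file shipped with the source; any non-empty fourth column switches force array on):

```
.//ClinVarAccession/@Acc | accession | str
.//Citation/ID | citation_ids | str | force_array
```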

filterprocessor.py (+22)

```python
import csv
import io
import os

from collections import namedtuple

# one parsed filter row: the XPath to evaluate, the target json attribute
# name, the builtin type to convert to, and whether to force a list result
Filter = namedtuple("Filter", "XPath jsonattr type force_array")


def load_filters(filterFileName):
    with open(filterFileName, 'rb') as csvfile:
        filters = []
        filterreader = csv.reader(csvfile, delimiter='|')
        for row in filterreader:
            result = Filter(
                XPath=row[0].strip(),
                jsonattr=row[1].strip(),
                type=row[2].strip(),
                force_array=len(row) > 3)
            filters.append(result)
        return filters
```
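
A minimal usage sketch for load_filters, assuming the filter file name used by the test scripts:

```python
from filterprocessor import load_filters

# each pipe-delimited row becomes a Filter namedtuple; the fourth
# column, when present, marks the attribute as force_array
filters = load_filters('clinvar_filter_corrected.txt')
for f in filters:
    print f.jsonattr, '<-', f.XPath, '(%s)' % f.type
```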

jsonserialize.py (+41)

```python
import json
import importlib

from lxml import etree

def fast_iter(context, func):
    # stream over iterparse events, clearing each element after use
    # so memory stays bounded on large documents
    for event, elem in context:
        func(elem)
        elem.clear()
    del context

def convert_type(value, type_):
    # assumes type_ names a builtin type such as 'str' or 'int'
    module = importlib.import_module('__builtin__')
    cls = getattr(module, type_)
    return cls(value)


def inner_json_serialize(elem, filters, outstream):
    result = {}
    for f in filters:
        xp = etree.XPath(f.XPath)
        children = xp(elem)
        attr_val = []
        for c in children:
            raw_val = c.text if (type(c) is etree._Element) else c
            attr_val.append(convert_type(raw_val, f.type))
        # collapse single matches to a scalar unless the filter forces an array
        if not f.force_array and len(children) < 2 and len(attr_val) > 0:
            attr_val = attr_val[0]
        elif len(attr_val) == 0:
            attr_val = None
        result[f.jsonattr] = attr_val
    if len(result) > 0:
        outstream.write(unicode(json.dumps(result)))
        outstream.write(u'\n')


def json_serialize(context, filters, outstream):
    fast_iter(context,
              lambda elem:
                  inner_json_serialize(elem, filters, outstream))
```
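
Wiring this up end to end looks like the sketch below, which mirrors the non-split path of xmljsonifier.py further down (file and element names are taken from the ClinVar test script and are illustrative):

```python
import io

from lxml import etree

from filterprocessor import load_filters
from jsonserialize import json_serialize

filters = load_filters('clinvar_filter_corrected.txt')
context = etree.iterparse('ClinVarFullRelease_2014-08.xml',
                          events=('end',), tag='ClinVarSet')
with io.open('output.json', 'w') as out:
    json_serialize(context, filters, out)  # one json object per line
```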

mrjobxmljsonifier.py (+45)

```python
import io
import os
import sys

from lxml import etree
from mrjob.job import MRJob
from mrjob.compat import get_jobconf_value

from jsonserialize import json_serialize
from filterprocessor import load_filters

class MrJobXMLJSONifier(MRJob):
    def mapper(self, _, line):
        # each input line is the path of one split xml file; filter it
        # and write the json-ified result alongside it
        filters = load_filters(get_jobconf_value("settings.filter"))
        context = etree.iterparse(line, events=('end',), tag=get_jobconf_value("settings.root"))
        result_file = line + ".mapped"
        with io.open(result_file, 'w') as file:
            json_serialize(context, filters, file)
        yield ("key", result_file)

    def reducer(self, key, file_iterator):
        # concatenate the mapped files into the destination, or to stdout
        files = list(file_iterator)

        result_file = get_jobconf_value("settings.destination")
        if result_file is not None:
            with open(result_file, "wb") as outfile:
                for f in files:
                    with open(f, "rb") as infile:
                        outfile.write(infile.read())
                    os.remove(f)
            yield key, result_file
        else:
            for f in files:
                with open(f, "rb") as infile:
                    sys.stdout.write(infile.read())


    def steps(self):
        return [self.mr(mapper=self.mapper, reducer=self.reducer)]


if __name__ == "__main__":
    MrJobXMLJSONifier.run()
```
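
Because the job reads its settings from jobconf rather than argv, it should also be runnable standalone, roughly as sketched below (paths are illustrative; absolute paths matter because mrjob's local runner executes tasks in scratch directories, which is also why xmljsonifier.py writes absolute paths into splitfiles.tmp):

```bash
python mrjobxmljsonifier.py -r local \
    --jobconf settings.root=ReferenceClinVarAssertion \
    --jobconf settings.filter="`pwd`/clinvar_filter_corrected.txt" \
    --jobconf settings.destination="`pwd`/output.json" \
    splitfiles.tmp
```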

requirements.txt (+1)

```diff
 lxml==3.4.0
+mrjob==0.4.2
 xmlutils==1.1
```

test/test_ClinVarFullRelease_2014_corrected.sh (+4 −4)

```diff
 #!/bin/bash -v
 
-#curl "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2014-08.xml.gz" -o ClinVarFullRelease_2014-08.xml.gz
-#gunzip ClinVarFullRelease_2014-08.xml.gz
-
 STARTTIME=$(date +%s)
-python ../xmljsonifier.py "ClinVarSet" clinvar_filter_corrected.txt ClinVarFullRelease_2014-08.xml --destination=output.xml
+curl "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2014-08.xml.gz" -o ClinVarFullRelease_2014-08.xml.gz
+gunzip ClinVarFullRelease_2014-08.xml.gz
+
+python ../xmljsonifier.py "ClinVarSet" clinvar_filter_corrected.txt ClinVarFullRelease_2014-08.xml --destination=output.json
 ENDTIME=$(date +%s)
 echo "$(($ENDTIME - $STARTTIME)) seconds to complete jsonifying..."
```

+12

```bash
#!/bin/bash -v

#curl "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2014-08.xml.gz" -o ClinVarFullRelease_2014-08.xml.gz
#gunzip ClinVarFullRelease_2014-08.xml.gz

STARTTIME=$(date +%s)
python ../xmljsonifier.py "ReferenceClinVarAssertion" "`pwd`/clinvar_filter_corrected.txt" ClinVarFullRelease_2014-08.xml --destination="`pwd`/output.json" --split=true --split_root="ClinVarSet"
ENDTIME=$(date +%s)
echo "$(($ENDTIME - $STARTTIME)) seconds to complete jsonifying..."

grep -o -w "<ReferenceClinVarAssertion" ClinVarFullRelease_2014-08.xml | wc -w
wc -l output.json
```

xmlSplit.py (+45)

```python
import io
import os
import time

from lxml import etree
from lxml.etree import tostring

def create_and_set_output_dir(name):
    # split documents land in a directory named [source][timestamp]
    output_directory = name + time.strftime("%Y%m%d-%H%M%S")
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    return output_directory

def fast_iter(context, batch_size, output_directory, func):
    batch = 0
    count = 0

    for event, elem in context:
        if count == 0:
            # starting a new batch file: write the wrapping root element
            with io.open(os.path.join(output_directory, str(batch)), 'a') as outstream:
                outstream.write(u'<split_root>\n')
        func(elem, batch)
        elem.clear()
        count = count + 1
        if count == batch_size:
            with io.open(os.path.join(output_directory, str(batch)), 'a') as outstream:
                outstream.write(u'\n</split_root>')
            batch = batch + 1
            count = 0
    if count > 0:
        # close the final, partially filled batch
        with io.open(os.path.join(output_directory, str(batch)), 'a') as outstream:
            outstream.write(u'\n</split_root>')
    del context

def dump_to_file(elem, output_directory, batch):
    with io.open(os.path.join(output_directory, str(batch)), 'a') as outstream:
        outstream.write(unicode(tostring(elem, with_tail=False)))

def xmlSplit(xmlFileName, root, batch_size):
    output_directory = create_and_set_output_dir(xmlFileName)
    context = etree.iterparse(xmlFileName, events=('end',), tag=root)
    fast_iter(context, batch_size, output_directory,
              lambda elem, count:
                  dump_to_file(elem, output_directory, count))
    return output_directory
```
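
A quick sketch of the splitter on its own, using names from the ClinVar test script (xmljsonifier.py below calls it with a batch size of 5000):

```python
from xmlSplit import xmlSplit

# writes batches of up to 5000 <ClinVarSet> elements into files named
# 0, 1, 2, ... under [source][timestamp], each wrapped in <split_root>
# so every batch parses as a standalone xml document
outdir = xmlSplit('ClinVarFullRelease_2014-08.xml', 'ClinVarSet', 5000)
print outdir
```

Re-opening each batch file in append mode keeps at most one file handle open at a time, at the cost of an open/close call per element.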

xmljsonifier.py (+40 −75)

```diff
 import sys
-import csv
 import json
+import os
 import io
 
-from argparse import ArgumentParser
-from argparse import RawTextHelpFormatter
-from copy import deepcopy
-from collections import namedtuple
 from lxml import etree
 
-Filter = namedtuple("Filter", "XPath jsonattr type force_array")
+from xmlSplit import xmlSplit
+from jsonserialize import json_serialize
+from argumentparser import parser
+from filterprocessor import load_filters
+from mrjobxmljsonifier import MrJobXMLJSONifier
 
-parser = ArgumentParser(description='''Filters contents of large xml data sources and produces json-ified results''', formatter_class=RawTextHelpFormatter)
 
-parser.add_argument('root', help='''
-root XPath element name, used to interpret the filter file against
-''')
-parser.add_argument('filter', help='''
-filter file name, containing column separated filter and map definitions like so:
-[XPath], [target json attribute name], [type], [force array]
-(see sample formatting file included with the source)
-''')
-parser.add_argument('source', help='''
-source xml formatted file
-''')
-parser.add_argument('--destination', required=False, nargs=1, help='''
-file name to store the generated json into; if ommited, will output to stdout
-''')
+cargs = parser.parse_args()
 
 
-args = parser.parse_args()
+if cargs.split is not None:
+    root_elem = cargs.root if cargs.split_root is None else cargs.split_root
+
+    splitfilesdirectory = xmlSplit(cargs.source, root_elem, 5000)
+    with open("splitfiles.tmp", "w") as split_file_list:
+        for path, subdirs, files in os.walk(splitfilesdirectory):
+            for filename in files:
+                f = os.path.join(path, filename)
+                split_file_list.write(os.path.join(os.getcwd(), str(f)) + os.linesep)
+    mr_jsonifier = MrJobXMLJSONifier(
+        args=['-r', 'local',
+              '--jobconf', 'settings.root=' + cargs.root,
+              '--jobconf', 'settings.filter=' + cargs.filter,
+              '--jobconf', 'settings.destination=' + cargs.destination[0],
+              'splitfiles.tmp'])
+    with mr_jsonifier.make_runner() as runner:
+        runner.run()
+    os.remove("splitfiles.tmp")
+    sys.exit(0)
+
 
 # read filter definitions
-filters = []
-with open(args.filter, 'rb') as csvfile:
-    filterreader = csv.reader(csvfile, delimiter='|')
-    for row in filterreader:
-        result = Filter(
-            XPath=row[0].strip(),
-            jsonattr=row[1].strip(),
-            type=row[2].strip(),
-            force_array=len(row) > 3)
-        filters.append(result)
-
-
-def fast_iter(context, func):
-    for event, elem in context:
-        func(elem)
-        elem.clear()
-    del context
+filters = load_filters(cargs.filter)
 
 #attempts to guess at (and convert into) a builtin type based on a string
-def convert_type(value, type_):
-    import importlib
-    #assuming built in type
-    module = importlib.import_module('__builtin__')
-    cls = getattr(module, type_)
-    return cls(value)
-
-def json_serialize(elem, outstream):
-    result = {}
-    for f in filters:
-        xp = etree.XPath(f.XPath)
-        children = xp(elem)
-        attr_val = []
-        for c in children:
-            raw_val = c.text if (type(c) is etree._Element) else c
-            attr_val.append(convert_type(raw_val, f.type))
-        if not f.force_array and len(children) < 2 and len(attr_val) > 0:
-            attr_val = attr_val[0]
-        elif len(attr_val) == 0:
-            attr_val = None
-        result[f.jsonattr] = attr_val
-    if result is not None and len(result) > 0:
-        outstream.write(unicode(json.dumps(result)))
-        outstream.write(u'\n') #easier on the eyes
-
-
-context = etree.iterparse(args.source, events=('end',), tag=args.root)
-
-if args.destination is not None:
-    with io.open(args.destination[0], 'w') as file:
-        fast_iter(context,
-                  lambda elem:
-                      json_serialize(elem, file))
+
+context = etree.iterparse(cargs.source, events=('end',), tag=cargs.root)
+
+if cargs.destination is not None:
+    with io.open(cargs.destination[0], 'w') as file:
+        json_serialize(context, filters, file)
 else:
-    fast_iter(context,
-              lambda elem:
-                  json_serialize(elem, sys.stdout))
+    json_serialize(context, filters, sys.stdout)
```
