Commit 2966100

Author: drjwbaker
Commit message: first open repo push
0 parents · commit 2966100

File tree

84 files changed: +2531 −0 lines changed


LICENSE

+22
@@ -0,0 +1,22 @@
The MIT License (MIT)

Copyright (c) 2015 University College London

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

+21
@@ -0,0 +1,21 @@
# Enabling Complex Analysis of Large Scale Digital Collections

This repository contains code, data, and other outputs from the first phase of '[Enabling Complex Analysis of Large Scale Digital Collections](http://figshare.com/articles/Enabling_Complex_Analysis_of_Large_Scale_Digital_Collections/1319482)', a project funded by the Jisc Research Data Spring.

The core project team are:

- PI Melissa Terras (UCL)
- CI James Baker (British Library)
- CI David Beavan (UCL)
- CI James Hetherington (UCL)
- CI Martin Zaltz Austwick (UCL)

Associated researchers (without whose research questions none of this could have happened!) are:

- Oliver Duke-Williams (UCL)
- Will Finley (Sheffield)
- Helen O'Neill (UCL)
- Anne Welsh (UCL)

All code, data, and other outputs are available for use and reuse under an [MIT Licence](http://opensource.org/licenses/MIT).

For more info on the project see the [UCL DH](http://blogs.ucl.ac.uk/dh/2015/05/07/bluclobber-or-enabling-complex-analysis-of-large-scale-digital-collections/) and [British Library Digital Scholarship](http://britishlibrary.typepad.co.uk/digital-scholarship/) blogs.

bluclobber/harness/__init__.py

Whitespace-only changes.

bluclobber/harness/decomposer.py

+41
@@ -0,0 +1,41 @@
import logging


class Decomposer(object):
    """Split an iterable into per-rank chunks, optionally subsampled."""
    def __init__(self, iterable, communicator=None, rank=None, size=None, subsample=1, offsets=None):
        logger = logging.getLogger('performance')
        self.logger = logger
        if not size:
            if not communicator:
                logger.debug("Assuming default rank and size")
                rank = 0
                size = 1
            else:
                logger.debug("Rank and size from MPI communicator")
                rank = communicator.rank
                size = communicator.size
        self.iterable = iterable
        if not offsets:
            self.count = len(iterable) / size  # integer division (Python 2)
            if rank == size - 1:
                # the last rank picks up the items left over by integer division
                self.remainder = len(iterable) % size
            else:
                self.remainder = 0
            self.start = self.count * rank
            self.end = self.count * (rank + 1) + self.remainder
        self.step = subsample
        self.step_offset = self.start % self.step
        logger.debug("Splitting " + str(len(iterable)) + " items into " + str(size) + " chunks of " + str(self.count))
        logger.debug("This is chunk " + str(rank) + " from " + str(self.start) + " to " + str(self.end))

    def __str__(self):
        return "Decomposer of len " + str(len(self)) + " from " + str(self.start) + " to " + str(self.end) + " in steps " + str(self.step)

    def __getitem__(self, index):
        if index >= len(self):
            raise IndexError
        new_index = self.start + index * self.step + self.step_offset
        return self.iterable[new_index]

    def __len__(self):
        return (self.end - 1) / self.step - (self.start - 1) / self.step
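
A quick sketch of Decomposer in serial use (not part of the commit; assumes a plain list, and Python 2 as elsewhere in the codebase):

from bluclobber.harness.decomposer import Decomposer

items = list(range(10))
chunk = Decomposer(items, subsample=2)  # no communicator given, so rank 0 of 1
print len(chunk)          # 5: every second item of the single chunk
print chunk[0], chunk[2]  # items[0] and items[4], i.e. 0 and 4

With an MPI communicator supplied, each rank would instead see its own contiguous slice of items.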

bluclobber/harness/mapreduce.py

+102
@@ -0,0 +1,102 @@
from functools import reduce
from mpi4py import MPI
import logging
from collections import defaultdict

from decomposer import Decomposer


class MapReduce(object):
    def __init__(self, mapper, reducer, communicator=None, subsample=1, shuffler=None, prepartitioned=False):
        self.unsafe_mapper = mapper
        self.unsafe_reducer = reducer
        self.unsafe_shuffler = shuffler
        self.subsample = subsample
        self.communicator = communicator
        self.prepartitioned = prepartitioned
        self.logger = logging.getLogger('performance')

        # safe reduce: treat None (a failed map) as the identity
        def safeReducer(a, b):
            if a is None:
                return b
            if b is None:
                return a
            return self.unsafe_reducer(a, b)
        self.reducer = safeReducer

        # safe map: log and swallow exceptions, yielding None instead
        def safeMap(arg):
            self.logger.debug("Entered mapper")
            try:
                result = self.unsafe_mapper(arg)
                self.logger.debug("Exiting mapper")
                return result
            except Exception as e:
                self.logger.warn("Problem with map")
                self.logger.warn(str(e))
                return None
        self.mapper = safeMap

        if shuffler:
            def safeShuffler(arg, count):
                try:
                    return self.unsafe_shuffler(arg, count)
                except Exception as e:
                    self.logger.warn("Problem with shuffle")
                    self.logger.warn(str(e))
                    return None
            self.shuffler = safeShuffler
        else:
            self.shuffler = None

    def execute(self, data):
        if self.communicator and self.communicator.size > 1:
            return self.parallel(data)
        else:
            return self.serial(data)

    def serial(self, data):
        try:
            count = len(data)
        except (TypeError, AttributeError):
            count = None  # unsized data; count is currently unused
        subsampled_data = Decomposer(data, subsample=self.subsample)
        quantities = map(self.mapper, subsampled_data)
        result = reduce(self.reducer, quantities)
        return result

    def parallel(self, data):
        perfLogger = logging.getLogger('performance')
        # local map
        if self.prepartitioned:
            partition = Decomposer(data, subsample=self.subsample)
        else:
            partition = Decomposer(data, self.communicator, subsample=self.subsample)
        perfLogger.info("Built iterator")
        quantities = map(self.mapper, partition)
        perfLogger.info("Mapped")
        local_result = reduce(self.reducer, quantities)
        perfLogger.info("Local reduce")

        # reduce under MPI
        def reduce_arrays(x, y, dtype):
            # the signature for a user-defined op takes a datatype, which we can ignore
            return self.reducer(x, y)
        reducer_mpi = MPI.Op.Create(reduce_arrays, True)
        perfLogger.debug("Local result: " + str(local_result)[0:60])

        if self.shuffler:
            # partition keys across ranks, then reduce each partition to its own root
            perfLogger.info("Shuffling")
            shuffled = defaultdict(dict)
            if local_result:
                for key in local_result:
                    shuffled[self.shuffler(key, self.communicator.size)][key] = local_result[key]
            for root in range(self.communicator.size):
                perfLogger.info("Reducing to rank " + str(root))
                temp = self.communicator.reduce(shuffled[root], op=reducer_mpi, root=root)
                if self.communicator.rank == root:
                    result = temp
        else:
            result = self.communicator.reduce(local_result, op=reducer_mpi, root=0)
            result = self.communicator.bcast(result, root=0)
        perfLogger.info("Global reduce")

        reducer_mpi.Free()
        return result
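
A hedged usage sketch (names here are illustrative, not from the commit): a word count over a list of strings, which takes the serial path because no communicator is passed. In the project itself, MapReduce is normally driven through the query harness below.

from bluclobber.harness.mapreduce import MapReduce

def mapper(text):
    counts = {}
    for word in text.split():
        counts[word] = counts.get(word, 0) + 1
    return counts

def reducer(a, b):
    # merge two partial counts; safeReducer has already handled Nones
    for key in b:
        a[key] = a.get(key, 0) + b[key]
    return a

engine = MapReduce(mapper, reducer)     # no communicator: execute() runs serial()
print engine.execute(["a b a", "b c"])  # {'a': 2, 'b': 2, 'c': 1}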

bluclobber/harness/query.py

+68
@@ -0,0 +1,68 @@
from ..model.corpus import Corpus
from ..model.dataset import DataSet
from mpi4py import MPI
import sys
from argparse import ArgumentParser
import logging
import yaml
from utils import *

shuffler = None
reporter = None
parser = Corpus


def main():
    args = clparser(sys.argv[1:])
    execfile(args.query_path, globals())  # must define 'mapper' and 'reducer';
    # may define 'shuffler' and 'reporter'
    perfLogger = logging.getLogger('performance')
    communicator = MPI.COMM_WORLD
    perfLogger.setLevel(getattr(logging, args.loglevel.upper()))
    stdout = logging.StreamHandler()
    stdout.setFormatter(logging.Formatter(str(communicator.rank) + '/' + str(communicator.size) +
                                          ' %(levelname)s: %(asctime)s %(message)s'))
    perfLogger.addHandler(stdout)
    result = query(mapper, reducer, args.corpus_path, args.downsample, args.bybook,
                   parser=parser, shuffler=shuffler, reporter=reporter)
    if result:
        if args.outpath:
            # one output file per rank, so parallel runs don't clobber each other
            outpath = args.outpath + '_' + str(MPI.COMM_WORLD.rank) + '.yml'
            with open(outpath, 'w') as result_file:
                result_file.write(yaml.safe_dump(result))
            perfLogger.info("Written result")
        else:
            print result


def clparser(commandline):
    clparser = ArgumentParser(description="Analyse a corpus")
    clparser.add_argument('query_path', type=str, help='path to python file describing query')
    clparser.add_argument('corpus_path', type=str, help='path to folder containing zipped corpus')
    clparser.add_argument('--downsample', type=int, metavar='N', default=1, help='optionally, use only every Nth zipfile')
    clparser.add_argument('--bybook', action="store_true", default=False)
    clparser.add_argument('--outpath', default=None, type=str, help='output path to yaml dump result')
    clparser.add_argument('--loglevel', default='info', type=str, help='log level (debug, info, warn, error)')
    args = clparser.parse_args(commandline)
    return args


def query(mapper, reducer, corpus_path, downsample=1, bybook=False, parser=parser, shuffler=None, reporter=None):
    communicator = MPI.COMM_WORLD
    perfLogger = logging.getLogger('performance')
    if parser == Corpus:
        corpus = Corpus(corpus_path, communicator)
        perfLogger.info("Constructed")
        result = corpus.analyse(mapper, reducer, downsample, bybook, shuffler=shuffler)
    else:
        corpus = DataSet(parser, corpus_path, communicator)
        result = corpus.analyse_by_file(mapper, reducer, downsample, shuffler=shuffler)
    perfLogger.info("Finished analysis")
    if (not shuffler) and communicator.rank != 0:
        result = None
    if reporter and result:
        result = reporter(result)
    perfLogger.info("Finished postprocessing")
    return result


if __name__ == "__main__":
    main()
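
Because the query file is exec'd into the harness's globals, it only has to define mapper and reducer (and may define shuffler and reporter). A hypothetical minimal query file, counting books per year (the year attribute is an assumption about the book model, not confirmed by this commit):

# years.py -- an illustrative query file; 'book.year' is assumed
def mapper(book):
    # one partial result per book: a single-entry histogram
    return {book.year: 1}

def reducer(a, b):
    for key in b:
        a[key] = a.get(key, 0) + b[key]
    return a

It might then be run with something like mpirun -np 4 python -m bluclobber.harness.query years.py /path/to/corpus --outpath results (the invocation is assumed, not shown in this commit).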

bluclobber/harness/repartition.py

+98
@@ -0,0 +1,98 @@
import os
import zipfile
import sys
from argparse import ArgumentParser
from itertools import islice
import shutil
import tempfile
import subprocess
import logging
from mpi4py import MPI
from ..model.corpus import Corpus
from decomposer import Decomposer


def main():
    perfLogger = logging.getLogger('performance')
    communicator = MPI.COMM_WORLD
    perfLogger.setLevel(logging.DEBUG)
    stdout = logging.StreamHandler()
    stdout.setFormatter(logging.Formatter(str(communicator.rank) + '/' + str(communicator.size) +
                                          ' %(levelname)s: %(asctime)s %(message)s'))
    perfLogger.addHandler(stdout)
    args = parser(sys.argv[1:])
    # verify out_path exists, or create it if not
    try:
        os.makedirs(args.out_path)
    except os.error:
        pass  # folder already exists; Python 3's exist_ok would be nicer
    # restripe if Lustre striping given
    if args.stripe:
        subprocess.check_call(['lfs', 'setstripe', '--count', str(args.stripe), args.out_path])
    if '.zip' in args.in_path:
        repartition_from_metazip(args.in_path, args.out_path, args.split)
    else:
        repartition(args.in_path, args.out_path, args.split, args.downsample)


def parser(commandline):
    parser = ArgumentParser(description="Repartition a corpus")
    parser.add_argument('in_path', type=str, help='path to corpus to repartition')
    parser.add_argument('out_path', type=str, help='path to folder to contain repartitioned corpus')
    parser.add_argument('--downsample', type=int, metavar='N', default=1, help='optionally, use only every Nth book')
    parser.add_argument('--split', type=int, metavar='N', default=64, help='repartition to N zipfiles')
    parser.add_argument('--stripe', type=int, metavar='N', default=None, help='Lustre striping for output')
    args = parser.parse_args(commandline)
    return args


def repartition(in_path, out_path, split, downsample=1, filter=lambda x: True):
    perfLogger = logging.getLogger('performance')
    corpus = Corpus(in_path, communicator=MPI.COMM_WORLD)
    this_processor_out = Decomposer(range(split), communicator=MPI.COMM_WORLD)
    processor_paths = Decomposer(corpus.paths, communicator=MPI.COMM_WORLD)
    processor_corpus = Corpus(processor_paths)
    for chunk_index, chunk in enumerate(this_processor_out):
        perfLogger.info("Starting output zip " + str(chunk))
        books = Decomposer(processor_corpus, rank=chunk_index, size=len(this_processor_out), subsample=downsample)
        perfLogger.debug("Will handle " + str(len(books)) + " books.")
        with zipfile.ZipFile(os.path.join(out_path, 'chunk' + str(chunk) + '.zip'), 'w', allowZip64=True) as outzip:
            for book in books:
                book.load()
                if not filter(book):
                    continue
                info = book.zip_info()
                # transfer from small zip to bigger zip
                outzip.writestr(info, book.archive.zip.read(info))
                for page_code in book.page_codes:
                    info = book.page_zip_info(page_code)
                    outzip.writestr(info, book.archive.zip.read(info))
        perfLogger.info("Completed output zip " + str(chunk))
    MPI.COMM_WORLD.Barrier()


def repartition_from_metazip(in_zip, out_path, split):
    tmpdir = tempfile.mkdtemp()
    this_processor = Decomposer(range(split))
    with zipfile.ZipFile(in_zip) as metazip:
        for chunk in this_processor:
            # open a zip for writing
            with zipfile.ZipFile(os.path.join(out_path, 'chunk' + str(chunk) + '.zip'), 'w', allowZip64=True) as outzip:
                this_chunk = list(islice(metazip.infolist(), chunk, None, split))
                for archive in this_chunk:
                    # extract the smaller zip to disk; it should be possible to do
                    # this in memory, but zipfile doesn't like reading a nested zip
                    # from a file-like object
                    metazip.extract(archive, tmpdir)
                    small = os.path.join(tmpdir, archive.filename)
                    try:
                        with zipfile.ZipFile(small) as inzip:
                            # transfer from small zip to bigger zip
                            for info in inzip.infolist():
                                outzip.writestr(info, inzip.read(info))
                    except zipfile.BadZipfile:
                        print "Bad file:", archive.filename
                    os.remove(small)
    shutil.rmtree(tmpdir)
    MPI.COMM_WORLD.Barrier()
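
A hedged programmatic sketch of the directory-based path (the paths are illustrative, and the filter shown assumes the book objects expose the page_codes attribute used above):

from bluclobber.harness.repartition import repartition

# gather many small per-book zips into 64 larger chunk zips,
# keeping only books that loaded with at least one page
repartition('/data/corpus_zips', '/data/chunks', split=64, downsample=1,
            filter=lambda book: len(book.page_codes) > 0)

Run under MPI, each rank writes its own share of the 64 output zips; the Barrier at the end keeps ranks in step before the job exits.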
