diff --git a/PyQT4/Karthika.exe b/PyQT4/Karthika.exe
new file mode 100644
index 0000000..560dae7
Binary files /dev/null and b/PyQT4/Karthika.exe differ
diff --git a/PyQT4/Karthika.py b/PyQT4/Karthika.py
new file mode 100644
index 0000000..2952128
--- /dev/null
+++ b/PyQT4/Karthika.py
@@ -0,0 +1,637 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from PyQt4 import QtCore, QtGui
+from PyQt4.QtGui import QListWidgetItem
+from PyQt4.QtCore import Qt
+import os, sys
+import PyQt4.uic
+import re
+import codecs
+from xml.dom.minidom import parseString
+from xml.parsers.expat import ExpatError
+try:
+ # import mod.bz2 as bz2
+ import bz2
+except ImportError:
+ import mod64.bz2 as bz2
+import random
+import mparser
+import mathexp
+import convert_idx_s
+from searcher import *
+from QTextBrowser2 import *
+
+##
+## Main class : GUI
+##
+
+class MainViewer(QtGui.QDialog):
+ def __init__(self,parent = None,name = None,modal = 0,fl = 0):
+ # Some options :
+ self.loadTabInBackground = True
+ latexRendering = True
+ fontSize = 9
+ self.smallerLayout = False
+ keepMathImageFiles = False # if True, do not erase old images when starting the program
+ # End
+
+ debugButton = False # hide debugs stuffs, only for normal layout
+
+ QtGui.QDialog.__init__(self, parent)
+ self.checkBox1 = None
+ self.textLabel1 = None
+
+ if self.smallerLayout == True:
+ # PyQt4.uic.loadUi(os.path.join(os.path.dirname(__file__), 'form3_smaller.ui'), self)
+ # PyQt4.uic.loadUi(os.path.join(os.path.dirname('C:/Sites\\wikipediaDumpReader\\'), 'form3_smaller.ui'), self)
+ PyQt4.uic.loadUi(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), "form3_smaller.ui"), self)
+ else:
+ # PyQt4.uic.loadUi(os.path.join(os.path.dirname(__file__), 'form3.ui'), self)
+ # PyQt4.uic.loadUi(os.path.join(os.path.dirname('C:/Sites\\wikipediaDumpReader\\'), 'form3.ui'), self)
+ PyQt4.uic.loadUi(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), "form3.ui"), self)
+ # self.checkBox3.setChecked(False)
+ self.checkBox3.setChecked(True)
+ self.checkBox3.hide()
+ # self.pushButton1.hide()
+ self.pushButton9.hide()
+ self.listBox3.hide()
+ # self.inputbz2Archive, self.outidxname, self.outblockname, self.outidxname_s = wikiDataProcessing()
+ '''self.inputbz2Archive = '/home/ashok/tawiktionary-20120418-pages-articles.xml.bz2'
+ self.outidxname = '/home/ashok/tawiktionary-20120418-pages-articles.idx.gz'
+ self.outblockname = '/home/ashok/tawiktionary-20120418-pages-articles.blocks.idx'
+ self.outidxname_s = '/home/ashok/tawiktionary-20120418-pages-articles.idx_s.gz'
+ self.idx_s = self.outidxname_s and convert_idx_s.load_entrylist_table(self.outidxname_s)
+ if "wiktionary" in self.inputbz2Archive:
+ self.wiktionary = True
+ else:
+ self.wiktionary = False
+ self.blocksaddr, self.bitaddr = readBlocksAddr(self.outblockname)'''
+ self.wiktionary = True
+
+ self.cachedArticles = {}
+ self.mathRendererInstance = None
+ self.cache_font = QtGui.QFont()
+ if fontSize > 0: # you may want to disable explicit font-setting if the .ui ones should prevail
+ self.setFontSize(fontSize)
+ print self.checkBox1
+ if debugButton == False:
+ if self.checkBox1:
+ self.checkBox1.setChecked(False)
+ self.checkBox1.hide()
+ elif self.checkBox1 != None:
+ self.checkBox1.setChecked(False)
+ if latexRendering:
+ try:
+ self.mathRendererInstance = mathexp.MathExp(keepMathImageFiles)
+ except:
+ print 'Error while loading math parser'
+ #try:
+ # self.textEdit1.setOpenLinks(False)
+ #except AttributeError:
+ # print "Qt < 4.3 found"
+ '''titleEntry = u"Wikipedia"
+ name = self.cacheArticle(titleEntry)
+ self.displayCachedEntry(name)'''
+ self.textLabel1.setText('')
+
+ def setFontSize(self, size):
+ self.cache_font.setPointSize(size)
+ self.textEdit1.setFont(self.cache_font)
+
+ def launchSearch(self):
+     """Search for the text typed in lineEdit1 and list the hits in listBox1.
+
+     Returns immediately when the entry is empty. The raw result of
+     search_for() is kept on self.Matches; every hit is shown by its
+     'word' and carries its 'meaning' as the item's Qt.UserRole data.
+     """
+     query = unicode(self.lineEdit1.text())
+     if not query:
+         return
+     print "LaunchSearch" , repr(query)
+     self.listBox1.clear()
+     self.Matches = search_for(query)
+     for hit in self.Matches:
+         # Parenting the item to listBox1 is what inserts it into the list.
+         entry = QListWidgetItem(hit['word'], self.listBox1)
+         entry.setData(Qt.UserRole, hit['meaning'])
+
+ def loadTextblockRaw(self, block, start,length):
+     """Read one article from the dump and return its raw wiki text and title.
+
+     block  -- index into self.blocksaddr / self.bitaddr
+     start  -- offset of the article inside the block
+     length -- number of bytes to read
+
+     Returns a (text, title) pair taken from the first 'text' and 'title'
+     elements of the parsed XML. When the XML cannot be parsed, an error
+     message is returned as the text and "error" as the title.
+     """
+     f = bz2.BZ2File( self.inputbz2Archive )
+     try:
+         L, olength = f.loadBlock(self.blocksaddr[block], self.bitaddr[block], start, length)
+         if olength - start < length:
+             # If we're on a block boundary, continue to next block
+             L2, _ = f.loadBlock(self.blocksaddr[block+1], self.bitaddr[block+1], 0, length-(olength - start))
+             L = L[:olength-start] + L2[:start + length-olength]
+     finally:
+         # The archive handle used to be left open on every call.
+         f.close()
+     try:
+         D = parseString("\n" + L)
+     except ExpatError:
+         return "Error : can't load this article - sorry", "error"
+
+     n = D.getElementsByTagName('title')
+     title = n[0].firstChild.nodeValue
+
+     n = D.getElementsByTagName('text')
+     t = n[0].firstChild.nodeValue
+     return t, title
+
+ def loadTextblock(self, block, start,length):
+ """ read a block of wiki-text, then filter-it through the regex-made converter"""
+ t, title = self.loadTextblockRaw(block, start, length)
+ if self.checkBox1 and self.checkBox1.isChecked():
+ self.textEdit2.setPlainText(t)
+
+ if t[0:9].upper() != "#REDIRECT": # avoid filtering if a redirection
+ t = filterMarkup(t, self.mathRendererInstance)
+ return t, title
+
+ def cacheARandomArticle(self):
+ # TODO: make it cleaner, make it error-proof
+ print "Choosing at random (this may be slow :-)"
+ idxname = self.outidxname.encode('utf-8')
+ if not hasattr(self, "numArticles"):
+ self.numArticles = int(os.popen('zcat "'+idxname+'" | wc -l', 'r').readline()[:-1])
+ # avoid choosing something containing":", avoid REDIRECT
+ t, candidate = "", ":"
+ while t[0:9].upper() == "#REDIRECT" or candidate.find(':') > -1:
+ i = random.randint(1, self.numArticles)
+ l = os.popen("zcat '"+idxname+"' | head -n %d | tail -n 1" % i).readline()
+ block, offset, leng = [int(x) for x in l.split()[-3:]]
+ t, candidate = self.loadTextblock(block, offset, leng)
+ titleEntry = candidate
+ self.cachedArticles[titleEntry] = t
+ self.listBox3.insertItem(self.listBox3.currentRow() + 1, titleEntry)
+ return titleEntry
+
+ def cacheArticle(self, titleEntry):
+ self.textEdit1.viewport().setCursor(QtCore.Qt.BusyCursor)
+ self.setCursor(QtCore.Qt.BusyCursor)
+ url = titleEntry.split('#')
+ titleEntry = url[0] # todo : to do
+
+ # special feature: load a random article.
+ if titleEntry == "":
+ return self.cacheARandomArticle()
+ # when loading an article, title have uppercased first word
+ if not titleEntry[0].isupper() and not self.wiktionary:
+ titleEntry = titleEntry[0].upper() + titleEntry[1:]
+
+ if not self.cachedArticles.has_key(titleEntry):
+ latin1 = titleEntry.encode('utf-8')
+ idxname = self.outidxname.encode('utf-8')
+ #print ('zgrep "^' + latin1 + '\t" ' + idxname, 'r')
+ if self.idx_s: # new style find
+ l = convert_idx_s.load_entry_addr(latin1, self.idx_s, idxname) or ""
+ else:
+ l = os.popen('zgrep "^' + latin1 + '\t" ' + idxname, 'r').readline()
+ # i don't utf_8_decode because i'm only interested in numbers
+ try:
+ block, offset, leng = [int(x) for x in l.split()[-3:]]
+ except ValueError:
+ if not self.smallerLayout:
+ self.textLabel1.setText(u"Article Not Found : " + titleEntry)
+ self.textEdit1.viewport().setCursor(QtCore.Qt.ArrowCursor)
+ self.setCursor(QtCore.Qt.ArrowCursor)
+ return None
+ try:
+ t, parsedtitle = self.loadTextblock(block, offset, leng)
+ except StandardError:
+ t = "Error while retrieving data from the dump for this article
\nSorry
\n(debug informations might be available in a terminal output)"
+
+ if t[0:9].upper() == "#REDIRECT":
+ destArticle = t[t.index('[[')+2:t.index(']]')]
+ if destArticle != titleEntry:
+ print '%s redirects to %s' % (`titleEntry`, `destArticle`)
+ return self.cacheArticle(destArticle)
+ else:
+ print "WARNING: Redirection to itself (%s). Abort" % `titleEntry`
+
+ self.cachedArticles[titleEntry] = t
+ self.listBox3.insertItem(self.listBox3.currentRow() + 1, titleEntry)
+
+ self.textEdit1.viewport().setCursor(QtCore.Qt.ArrowCursor)
+ self.setCursor(QtCore.Qt.ArrowCursor)
+ return titleEntry
+
+ def displayCachedEntry(self, titleEntry):
+ if not titleEntry:
+ return
+ if not self.smallerLayout:
+ self.textEdit1.setHtml(self.cachedArticles[titleEntry])
+ self.textLabel1.setText("%s
" % titleEntry)
+ else:
+ self.textEdit1.setHtml(("%s
" % titleEntry) + self.cachedArticles[titleEntry])
+ self.currentlyDisplayedArticle = titleEntry
+
+ i = self.listBox3.findItems(titleEntry, Qt.MatchExactly)[0]
+ self.listBox3.setCurrentItem(i)
+
+ @QtCore.pyqtSignature("")
+ def on_pushButton9_clicked(self):
+ self.closeCurrentPage()
+
+ def closeCurrentPage(self):
+ row = self.listBox3.currentRow()
+ if self.listBox3.count() > 1:
+ del self.cachedArticles[self.currentlyDisplayedArticle]
+ self.listBox3.takeItem(row)
+
+ def on_listBox3_currentTextChanged(self, a0):
+ '''if not a0.isNull():
+ name = self.cacheArticle(unicode(a0))
+ self.displayCachedEntry(name)'''
+
+ def on_listBox1_currentTextChanged(self, a0):
+ if not a0.isNull():
+ self.lineEdit1.setText(a0)
+ self.textEdit1.setHtml('')
+ curItem = self.listBox1.currentItem()
+ word = curItem.text().toUtf8()
+ meaning = curItem.data(Qt.UserRole).toString()
+ meaning_text = get_markup(unicode(word, 'utf-8'), str(meaning))
+ t = filterMarkup(meaning_text, self.mathRendererInstance)
+ self.textEdit1.setHtml(t)
+ self.textLabel1.setText("%s
" % a0)
+
+ def on_listBox1_itemDoubleClicked(self, widgetitem):
+ curItem = self.listBox1.currentItem()
+ word = curItem.text().toUtf8()
+ meaning = curItem.data(Qt.UserRole).toString()
+ meaning_text = get_markup(unicode(word, 'utf-8'), str(meaning))
+ t = filterMarkup(meaning_text, self.mathRendererInstance)
+ self.textEdit1.setHtml(t)
+
+ @QtCore.pyqtSignature("")
+ def on_pushButton1_clicked(self):
+ self.textLabel1.setText('')
+ self.textEdit1.setHtml('')
+ self.launchSearch()
+
+ @QtCore.pyqtSignature("")
+ def on_pushButton10_clicked(self):
+ h = helpDlg(self)
+ h.exec_()
+
+ s = unicode(self.lineEdit1.text())
+ if self.listBox1.count() > 0:
+ i = self.listBox1.findItems(s, Qt.MatchExactly)[0]
+ self.listBox1.setCurrentItem(i)
+ curItem = self.listBox1.currentItem()
+ word = curItem.text().toUtf8()
+ meaning = curItem.data(Qt.UserRole).toString()
+ meaning_text = get_markup(unicode(word, 'utf-8'), str(meaning))
+ t = filterMarkup(meaning_text, self.mathRendererInstance)
+ self.textEdit1.setHtml(t)
+ self.textLabel1.setText("%s
" % curItem.text())
+ self.listBox1.setFocus()
+ else:
+ self.textLabel1.setText(u"Article Not Found : " + s)
+
+ @QtCore.pyqtSignature("")
+ def on_pushButton3_clicked(self):
+ self.textLabel1.setText('')
+ self.textEdit1.setHtml('')
+ self.launchSearch()
+
+ @QtCore.pyqtSignature("")
+ def on_pushButton2_clicked(self):
+     """Debug button: run the markup filter on the text of textEdit2.
+
+     The filtered result is rendered as HTML in textEdit1 and shown
+     verbatim as plain text in textEdit3.
+     """
+     # The statements that used to follow an unconditional `return` here
+     # (including an exec() of the textEdit3 content) were unreachable
+     # and have been removed.
+     k = filterMarkup(unicode(self.textEdit2.toPlainText()), self.mathRendererInstance)
+     self.textEdit1.setHtml(k)
+     self.textEdit3.setPlainText(k)
+
+ # @QtCore.pyqtSignature("")
+ # def on_pushButton10_clicked(self):
+
+ def on_textEdit1_anchorClicked(self, qurl):
+ self.lineEdit1.setText(unicode(qurl.toString()))
+ self.pushButton1.click()
+
+ def on_lineEdit1_returnPressed(self):
+ self.pushButton1.click()
+
+##
+## Wiki Archive IO settings & init
+##
+
+def readBlocksAddr(outblockname):
+    """Load the block address table written next to the index.
+
+    Every non-blank line of `outblockname` must hold three
+    whitespace-separated fields; the second and third are parsed as
+    integers. Returns two parallel lists (blocksaddr, bitaddr) built from
+    the second and third field of each line.
+
+    Raises ValueError on a malformed line and IOError if the file
+    cannot be opened.
+    """
+    blocksaddr = []
+    bitaddr = []
+    f = open(outblockname)
+    try:
+        for k in f:
+            fields = k.split()
+            if not fields:
+                # tolerate blank lines (e.g. a trailing newline at end of file)
+                continue
+            a, b, c = fields
+            blocksaddr.append(int(b))
+            bitaddr.append(int(c))
+    finally:
+        # the handle used to be left for the garbage collector to close
+        f.close()
+    return blocksaddr, bitaddr
+
+class helpDlg(QtGui.QDialog):
+ def __init__(self, parent=None):
+ QtGui.QDialog.__init__(self, parent)
+ self.setWindowTitle(unicode('உதவி', 'utf-8'))
+ self.resize(200,200)
+
+"""Please select a dump file from the list, or load a new one
+You can download a dump from your language from the wikipedia web server at :
+http://download.wikimedia.org/backup-index.html
+They are files generally named like pages-articles.xml.bz2."""
+class LoaderBox(QtGui.QDialog):
+    """Dialog that lets the user pick a dump file, remembering earlier choices.
+
+    The list of known dumps is kept, one path per line, in the file
+    '.wikipediadumpreaderrc' in the user's home directory. After the
+    dialog is accepted, self.l[0] is the selected dump.
+    """
+    def __init__(self, *kargs):
+        QtGui.QDialog.__init__(self, *kargs)
+        # Look for the .ui file next to the running script, the same way
+        # MainViewer locates form3.ui, instead of a hard-coded C:\ path.
+        PyQt4.uic.loadUi(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), 'loader.ui'), self)
+        self.connect(self.buttonBox.button(QtGui.QDialogButtonBox.Open), QtCore.SIGNAL("clicked()"), self.load)
+        self.connect(self.buttonBox.button(QtGui.QDialogButtonBox.Ok), QtCore.SIGNAL("clicked()"), self.ok)
+        # expanduser() also works where HOME is not set (os.environ['HOME']
+        # raised KeyError there).
+        self.conffile = os.path.join(os.path.expanduser('~'), '.wikipediadumpreaderrc')
+        try:
+            f = codecs.open(self.conffile, encoding='utf-8')
+            try:
+                # skip blank lines so they do not show up as empty entries
+                self.l = [x.rstrip() for x in f if x.strip()]
+            finally:
+                f.close()
+        except (IOError, UnicodeError):
+            # missing or unreadable configuration: start with an empty list
+            self.l = []
+        self.comboBox.addItems(self.l)
+
+    def load(self):
+        """Ask for a dump file and put it first in the list and the combo box."""
+        t = QtGui.QFileDialog.getOpenFileName(None, "Opening a dump", os.curdir, 'Wikipedia dump (*articles.xml.bz2 *.xml.bz2)')
+        if not t.isNull():
+            self.l.insert(0, unicode(t))
+            self.comboBox.insertItem(0, t)
+            self.comboBox.setCurrentIndex(0)
+
+    def accept(self):
+        """Disabled on purpose: the dialog is only accepted through ok()."""
+        pass
+
+    def ok(self):
+        """Move the selected dump to the front, save the list, accept the dialog."""
+        # Nothing is accepted while the list is empty: the caller reads
+        # self.l[0] once exec_() returns a true value.
+        if len(self.l):
+            self.l = [self.l.pop(self.comboBox.currentIndex())] + self.l
+            f = codecs.open(self.conffile, 'w', encoding='utf-8')
+            try:
+                f.write("\n".join(self.l))
+            finally:
+                f.close()
+            QtGui.QDialog.accept(self)
+
+def wikiDataProcessing():
+ if len(sys.argv) == 2:
+ inputbz2Archive = sys.argv[1].decode('utf-8')
+ else:
+ v = LoaderBox()
+ if v.exec_():
+ inputbz2Archive = v.l[0]
+ del v
+ else:
+ sys.exit(0)
+
+ filesize = int(os.stat(inputbz2Archive)[6] / 1024 / 1024) + 1
+
+ filename = inputbz2Archive
+ if filename.endswith('.xml.bz2'):
+ outidxname = filename[:-8] + '.idx.gz'
+ outblockname = filename[:-8] + '.blocks.idx'
+ outidxname_s = filename[:-8] + '.idx_s.gz'
+ else:
+ print "filename : ", `filename`
+ print "file should be a wikipedia .xml.bz2 named file - Aborting"
+ sys.exit(0)
+
+ # Build index if needed
+ index_exists = os.path.exists(outidxname)
+ # First pass: build main (large) index
+ if not index_exists or not os.path.exists(outblockname) or \
+ os.stat(inputbz2Archive)[8] > os.stat(outidxname)[8]:
+
+ dialogBuild = QtGui.QProgressDialog("Building the index", "Abort", 0, filesize)
+ dialogBuild.show()
+
+ def callback(currentPos):
+ dialogBuild.setValue(currentPos)
+ QtGui.qApp.processEvents()
+ if (dialogBuild.wasCanceled()):
+ QtGui.QMessageBox.critical(None, "Abort", "Index creation canceled - it may be incomplete or corrupt\nYou might want to manually delete the two files: \n%s\n%s" % (outidxname, outblockname))
+ sys.exit(0)
+ try:
+ mparser.buildIndex(inputbz2Archive, outidxname, outblockname, callback)
+ except EOFError, e:
+ QtGui.QMessageBox.critical(None, "Index creation problem", unicode(e))
+ # sanity check on the indexes files : must be >0 bytes, and must been both written within 1 second range
+ s1, s2 = os.stat(outblockname), os.stat(outidxname)
+ if s1[6] == 0 or s2[6] == 0 or abs(s1[8] - s2[8]) > 1:
+ QtGui.QMessageBox.warning(None, "Continue anyway", "The index files are either empty or have different writing times - it may indicate a problem, such as an interrupted previous indexing.\nTo force re-indexing, you can manually delete the two files: \n%s\n%s.\nThe program will now continue anyway." % (outidxname, outblockname))
+
+ # Second pass : entry-list sorting only
+ if not os.path.exists(outidxname_s): # interactive idx_s creation
+ if index_exists and QtGui.QMessageBox.question(None, "Old index format found", "The index format can be upgraded to load articles quicker.\nUpgrading may take 1 to 5 minutes depending on the computer and language.\nDo you want to do this now ?", QtGui.QMessageBox.Yes | QtGui.QMessageBox.No) == QtGui.QMessageBox.No:
+ outidxname_s = None
+ else:
+ try:
+ QtGui.QMessageBox.information(None, "Indexing", "No window will be shown for the next few minutes") # warn that i'm too lazy to bg jobs GUI ;)
+ convert_idx_s.build_sorted_entrylist(outidxname)
+ os.utime(outblockname, None) # change atime to avoid false-positive above
+ os.utime(outidxname, None)
+ except AssertionError, e:
+ QtGui.QMessageBox.critical(None, "Index postprocessing problem", "Sorry, the following error occured:\n%s\nThe program will now continue anyway in slow indexing mode" % unicode(e))
+ outidxname_s = None
+
+ return inputbz2Archive, outidxname, outblockname, outidxname_s
+
+
+
+##
+## Main Wiki-syntax processing functions
+##
+
+def convertWikiList(txtLines):
+ """ Parser for the namedlist/unnamedlist/definition """
+ def indexDiff(a, b):
+ x = 0
+ for c1, c2 in zip(a, b):
+ if c1 == c2:
+ x += 1
+ else:
+ break
+ return x
+
+ out = ""
+ mode = "%s"
+ stack = []
+ c = ""
+ common = 0
+ pattern = re.compile('[*#:;]+')
+ for line in txtLines:
+ linehead = re.match(pattern, line)
+ if linehead:
+ sl = linehead.end()
+ sp = len(stack)
+ common = indexDiff(stack, line)
+ #for common, x in enumerate(zip(stack, line)):
+ # if x[0] != x[1]:
+ # break
+ #else:
+ # common = min(len(stack), len(line))
+ for x in range(sp, common, -1):
+ c = stack.pop()
+ #print "pop", c
+ if c == '*':
+ out += ""
+ if c == '#':
+ out += ""
+ if c == ':':
+ out += ""
+ lastpoped = c
+ if not linehead:
+ break
+ for x in range(common, sl):
+ c = line[x]
+ stack.append(c)
+ #print "push", c
+ if c == '*':
+ out += ""
+ mode = "- %s
"
+ elif c == '#':
+ out += ""
+ mode = "- %s
"
+ elif c == ':' and lastpoped == '*':
+ out += "- "
+ mode = "
- %s
"
+ elif c == ';':
+ k = line.find(':', x+1)
+ if k > -1:
+ head, line = line[x+1:k], line[k+1:]
+ sl = 0
+ else:
+ head, line = line[x+1:], "" # "" will be skipped
+ out += "- %s
- " % head
+ stack[-1] = ':'
+ mode = "%s"
+ k = line[sl:].strip()
+ if k:
+ out += mode % k
+ return out
+
+def filterMarkup(t, mathRendererInstance = None):
+ """Ref: http://meta.wikimedia.org/wiki/Help:Wikitext_reference"""
+ t = re.subn('(?s)', "", t)[0] # force removing comments
+
+ t = re.subn("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$","", t)[0] # force remove last (=languages) list
+
+ def equal2h(m):
+ m = m.groupdict()
+ m['level'] = str(len(m['level']))
+ return "\n%(title)s\n
\n" % m
+ t = re.subn("\n(?P=+) *(?P[^\n]*)\\1 *(?=\n)", equal2h, t )[0]
+
+ if mathRendererInstance: # was set to None if texvc is unavailable
+ t = re.sub("(?s)", mathRendererInstance.parse_exp, t)
+
+ t = re.sub("'''(.+?)'''", "\\1", t)
+ t = re.sub("''(.+?)''", "\\1", t)
+
+ t = re.subn("(?u)^ \t]*==[ \t]*(\w)[ \t]*==[ \t]*\n", '(Image: \\1)
', t)[0]
+ # Instead of trying to implement a recursive parser which i want to avoid as much as possible,
+ # simply do a 2-depth-only substitution with a 2-pass process
+ t = re.subn("{{([^}{]*)}}", macroGeneric, t)[0]
+ t = re.subn("{{([^}]*)}}", macroGeneric, t)[0]
+ # FIXME : those patterns can't recurse (it _is_ REGex after all :-)
+ # Therefore may fail on such cases as : [[Image:plop.png|This is an [[image]] with link]]
+ # I don't think it's a big deal as for now
+
+ t = re.subn("\[\[([^][|:]*)\]\]", '\\1', t)[0]
+ t = re.subn("\[\[([^]|[:]*)\|([^][]*)\]\]", '\\2', t)[0]
+ t = re.subn('\n----', '\n
', t)[0]
+ def img2alt(m):
+ imgname, other = m.groups()
+ alttxt = other[other.rfind('|')+1:]
+ return '(Image: %s, %s)' % (imgname, alttxt)
+ t = re.subn("\[\[[Ii]mage:([^.]*)(.*?)\]\]", img2alt, t)[0] # todo: parser l'interieur
+
+ def wiki2table(match):
+ m = match.groupdict()
+ #print re.split('\n\|-+', m['body'])
+ s = "\n" % m['head']
+ lines = []
+ for T in re.split('\n\|-+', m['body']):
+ if T.startswith('|-'):
+ T = T[2:]
+ T = re.subn('([|!])\\1', '\n\\1', T)[0]
+ def tmp(m):
+ a,b,c = m.groups()
+ if a == '!':
+ return "%s | " % (b[:-1], c)
+ else:
+ return "%s | " % (b[:-1], c)
+ T = re.subn('([|!])((?:[^|\n]+\|)?)([^|\n]*)', tmp, T)[0]
+ lines.append(T)
+ s += "\n" + "
\n".join(lines) + "
\n"
+ s += "
\n"
+ if m['caption']:
+ s += "Table Caption : %s
\n" % m['caption'][2:]
+ return s
+ t = re.subn('\{\|(?P[^!|}]+)(?P(\|\+.*)?)(?P(.*\n)+?)\|\}', wiki2table, t)[0]
+
+ t = re.subn("\n(([#*:;]+[^\n]+\n)+)", lambda m : convertWikiList(m.group().split('\n')[1:]), t)[0]
+
+ t = re.sub("\n\n+", "\n
", t)
+ t = re.sub("(
\n?)+", "\n
", t) # final cleanup :-)
+
+ footnotes = []
+ namedref = {}
+ def footparse(m):
+ op, cl = m.groups()
+ if op.startswith('>'): # regular ref
+ footnotes.append(op[1:])
+ return "[%d]" % len(footnotes)
+ elif op.startswith(' name'): # named ref
+ key = op.lstrip()[op.find('"'):op.find('"',8)]
+ if not namedref.has_key(key):
+ footnotes.append("")
+ namedref[key] = len(footnotes)
+ if cl != '/>':
+ footnotes[namedref[key] - 1] = op[op.find('>')+1:]
+ return "[%d]" % namedref[key]
+ def footwrite(footnotes):
+ if footnotes == []:
+ return ""
+ else:
+ return "
".join(["%d. %s\n" % (i+1, x) for i, x in enumerate(footnotes)])
+ # FIXME : footnotes may not go to the '' tag depending on the macros and languages
+ t = re.subn('(?s)[ ].*?)(]|/>)', footparse, t)[0]
+ t, hasfound = re.subn('', footwrite(footnotes), t)
+ if (not hasfound) and len(footnotes):
+ t = t + "
Notes
" + footwrite(footnotes)
+ return t
+
+###
+### Macro-specific parsing
+###
+# Macro content may introduces "\n" which conflicts with list-parsing code, thus s/\n/ /
+def macroGeneric(mo):
+ argv = mo.group()[2:-2].replace('\n', '').split('|')
+ # Only one macro example implemented so far :-)
+ if argv[0].startswith('formatnum:'):
+ t = argv[0][10:]
+ #elif argv[0].startswith(....): t = ...
+ elif argv[0].startswith('main'):
+ txt = "(main article : [[%s]])" % argv[1]
+ t = '%s
' % txt
+ elif argv[0].startswith('reflist'):
+ t = ""
+ else:
+ txt = " %s %s " % (argv[0], ", ".join(argv[1:]))
+ t = '%s' % txt
+ return t
+
+
+###
+### Init code
+###
+
+def main():
+    """Start the Qt application, show the main viewer and exit with the loop's status."""
+    application = QtGui.QApplication(sys.argv)
+    viewer = MainViewer()
+    viewer.setWindowFlags(Qt.Window)
+    viewer.show()
+    exit_status = application.exec_()
+    sys.exit(exit_status)
+
+try: # Test if i'm running in IPython, for easy debugging.
+ # If so, i'll use the IPython's background Qt-loop feature
+ print _ip
+ print "Running in ipython mode "
+ if QtGui.qApp.instance() == None:
+ app = QtGui.QApplication(sys.argv)
+ w = MainViewer()
+ w.setWindowFlags(Qt.WindowMinMaxButtonsHint)
+ w.show()
+except NameError:
+ main()
diff --git a/PyQT4/QTextBrowser2.py b/PyQT4/QTextBrowser2.py
new file mode 100644
index 0000000..02ae60f
--- /dev/null
+++ b/PyQT4/QTextBrowser2.py
@@ -0,0 +1,6 @@
+import PyQt4.QtGui
+# Overload just the setSource member of QTextBrowser
+# Should only be necessary with Qt < 4.3 (missing Qt4.3's setOpenLinks(False))
+class QTextBrowser2(PyQt4.QtGui.QTextBrowser):
+ """QTextBrowser whose setSource() does nothing.
+
+ Per the note above the class, this is meant for Qt < 4.3, which lacks
+ setOpenLinks(False).
+ """
+ def setSource(*args):
+ # Accept any arguments (including self) and deliberately ignore them.
+ pass
diff --git a/PyQT4/README b/PyQT4/README
new file mode 100644
index 0000000..0fb21d4
--- /dev/null
+++ b/PyQT4/README
@@ -0,0 +1,90 @@
+A Wikipedia-Dump Reader.
+
+This Reader displays the text-only archives of wikipedia, which can be
+downloaded from :
+ http://download.wikimedia.org/backup-index.html
+and are usually named like :
+ pages-articles.xml.bz2
+
+It requires Python, Qt and PyQt. Although only Qt4/PyQt4 is supported now, the
+old Qt3/PyQt3 code is still included and should still work.
+It also assumes you have basic tools like gzip, zcat and zgrep, tail, head...
+
+(Optional) You will need the command line applications "texvc" and "latex" in
+order to render math expressions. (texvc is provided with this application)
+
+This reader is not yet complete although fairly usable in its current form.
+
+Usage
+-----
+1. on the commandline, run:
+ python dumpReader.py
+ or just click on it from your favorite file manager
+
+2. Browse and select the archive (some file probably named *.xml.bz2)
+
+3. If it's the first time, an index is created, which can take a lot of time.
+ The English dumps currently need more than an hour. Note that if you
+ abort during the index creation, the index will be usable, although obviously
+ incomplete. (Useful for users who want to quickly test the program ;)
+ Currently, the program needs write permission on the dump's directory.
+
+4. The main window contains the article title area (top), main text area
+ (left) and article history (right). You can go to an article by typing
+ its name then clicking the "Go" button, or by clicking a link in the main
+ text area. By default, clicking a link loads the article in the background.
+ The search-box area lets you keyword-search among the articles' titles.
+ You can also go to a random article by clicking "Go" with an empty entry.
+
+* You will need the command line application "Texvc" in order to
+ render math expressions. This tool requires "Latex". Note that it
+ will use a directory (usually /tmp/wikipediaDumpReader_texvm/) to
+ render the images, which is cleared at the restart of the application.
+
+FAQ
+---
+Q. Can i get my dump quickly up-to-date while i'm online ?
+A. No. As far as i know, there is no way to "update" your currently downloaded
+ xml.bz2 dump to sync it. The only way to get up-to-date is to delete the old
+ dump (and also generated indexes files) and to fully re-download a new one.
+
+Q. I don't like the background-loading behaviour. Can i change it ?
+A. If you want to immediately see the content of clicked links, you have to
+ manually modify the program : Edit the "dumpReader.py" file, go to the line
+ which says "self.loadTabInBackground = True" and change "True" to "False".
+
+Q. Can i disable the graphical rendering of the maths ? ("latex rendering")
+A. Yes, but you will have to manually modify the program : Edit the
+ "dumpReader.py" file, go to the line which says "self.latexRendering = True"
+ and change "True" to "False"
+
+Q. Can i change the text size ?
+A. Font Size can now be changed, although you will have to manually modify
+ the program : Edit the "dumpReader.py" file, go to the line which says
+ "fontSize = 9" and change "9" to whatever point size fits you best.
+ This will only change the font size of the text area.
+
+Q. Can i edit the User Interface to change more settings ?
+A. If you have the Qt4 "designer" program, shipped with Qt-tools, you
+ can edit "form3.ui" to fit your needs
+
+Q. What is the "debug" button ?
+A. This is needed only for developers. When toggle-on, each newly-loaded
+ article is also copied on the upper area. When pressing "apply regex",
+ it's filtered to the lower area.
+
+Q. The program says : RuntimeWarning: Python C API version mismatch for
+ module bz2: This Python has API version 1013, module bz2 has version 1012.
+A. This can be safely ignored. This occurs because I provide a precompiled
+ binary bz2.so module. You are welcome to recompile your own if you want
+ from the src/ directory. Warning : this is NOT the standard bz2.so python
+ module, it's a static copy with some changes.
+
+Q. How can I delete entries from the dump-selection initial dialog box ?
+A. There is no other way than editing the file ".wikipediadumpreaderrc" from
+ your home directory and removing the lines you don't want. You may need
+ to check "display hidden files" on your file manager to find this file.
+
+--
+Benjamin Thyreau - 7/2009
+wikireader@decideur.info
diff --git a/PyQT4/convert_idx_s.py b/PyQT4/convert_idx_s.py
new file mode 100644
index 0000000..d96aaea
--- /dev/null
+++ b/PyQT4/convert_idx_s.py
@@ -0,0 +1,78 @@
+# Convert unsorted gzipped dump-entries list (ie. first-pass index) to sorted and
+# seekable gzipped entries list. Use the third-party 'zran' program almost unmodified
+
+import os
+from os.path import join as J
+import re, pickle
+import bisect, os
+
+global zranbin
+
+def assert_zran_runtime():
+    """Locate the bundled 'zran_wdr' binary and check that it can be run.
+
+    Sets the module-level `zranbin` to the expected path next to this
+    module. Raises AssertionError when the binary is missing or when
+    running it without arguments does not print a text containing 'usage'.
+    """
+    global zranbin
+    zranbin = J(os.path.dirname(__file__), './zran_wdr')
+    binary_present = os.path.exists(zranbin)
+    assert binary_present, "can't find 'zran_wdr' binary at '%s'" % zranbin
+    # stderr is merged into stdout so the usage text is seen either way
+    probe_output = os.popen(zranbin + ' 2>&1').read()
+    assert 'usage' in probe_output, "unexpected error calling 'zran_wdr'"
+
+def build_sorted_entrylist(zindexfilename):
+ # assert everything is ok before starting
+ assert_zran_runtime()
+ assert zindexfilename.endswith('.idx.gz'), "wrongly named .idx.gz filename"
+ zindexfilename_s = zindexfilename[:-3] + '_s.gz'
+ assert not os.path.exists(zindexfilename_s), "a file named %s already exists" % zindexfilename_s
+ assert 'sorted' in os.popen("LANG=C sort --help").read(), "unexpected error calling 'sort'"
+ assert 'counts' in os.popen("LANG=C wc --help").read(), "unexpected error calling 'wc'"
+ filesize = int(os.stat(zindexfilename)[6]) // 1024
+ tmp_freespace = int(os.popen('/bin/df -P /tmp').readlines()[1].split()[3])
+ assert filesize < tmp_freespace, "not enough space left on /tmp (report %dK, need %sK)" % (tmp_freespace, filesize)
+
+ # Do actual sorting - blocking + slow + i don't think i can monitor progress
+
+ tmpname = os.tmpnam()
+ # it looks that utf8-encoded strings won't work on shell commands after an ">", thus tmpname
+ print "zcat input | LANG=C sort | gzip -c > %s" % tmpname # this print was crashing the app with utf8 args when run from the gnome-panel (?!)
+ os.popen(("zcat %s | LANG=C sort | gzip -c > %s" % (zindexfilename.encode('utf-8'), tmpname)))
+ print "checking"
+ nblines_old = int(os.popen(("zcat %s | wc -l" % zindexfilename).encode('utf-8')).read().strip())
+ nblines_new = int(os.popen("zcat %s | wc -l" % tmpname).read().strip())
+ assert nblines_new == nblines_old, "number of entries don't match"
+ os.popen("/bin/mv -f %s %s" % (tmpname, zindexfilename.encode('utf-8')))
+ print "indexing entrylist"
+ #filesize = int(os.stat(zindexfilename)[6]) / 100
+ bufsize = "409600"
+ cmd = os.popen( zranbin + " %s -i %s -S %s -c 2>&1 | grep zran_index_save_point" % (zindexfilename.encode('utf-8'), zindexfilename_s.encode('utf-8'), bufsize))
+ L = [('', '0', '0')]
+ for l in cmd:
+ r = re.findall('(.*)zran_index_save_point out=(\d+), in=(\d+)_(.*)', l)[0]
+ L.append((r[0]+r[3], r[1], r[2]))
+ #print int(r[2]) // filesize # progress bar
+
+ Ltxt = pickle.dumps(L, protocol=2) # almost __repr__
+
+ # Cat the entrylist tab and its file-offset at the end of the _s file.
+ f = open(zindexfilename_s, 'a')
+ f.seek(0, 2)
+ l=f.tell()
+ length = '0x%08X' % l
+ f.write(Ltxt)
+ f.write(length)
+ f.close()
+ print "Finished"
+
+def load_entrylist_table(zindexfilename_s):
+    """Load the pickled save-point table stored at the end of an '_s' index file.
+
+    The last 10 bytes of the file hold the table's file offset, formatted
+    as '0x%08X' by build_sorted_entrylist(); the pickled table lies between
+    that offset and those last 10 bytes.
+
+    Returns the unpickled table, or None when the zran_wdr binary is not
+    usable (assert_zran_runtime() fails).
+    """
+    try:
+        assert_zran_runtime()
+    except AssertionError:
+        return None
+    # Binary mode: the file holds gzip data followed by a protocol-2 pickle.
+    f = open(zindexfilename_s, 'rb')
+    try:
+        f.seek(-10, 2)
+        # Parse the hexadecimal offset directly rather than eval()-ing
+        # bytes read from a file.
+        table_offset = int(f.read(10), 16)
+        f.seek(table_offset)
+        # NOTE(review): pickle.loads() can execute arbitrary code; only
+        # open index files that this program generated itself.
+        idx_s = pickle.loads(f.read()[:-10])
+    finally:
+        f.close()
+    return idx_s
+
+def load_entry_addr(entry, idx_s, zindexfilename): # fixme entry & filename must be already utf8-decoded
+ """Return the index line(s) for `entry`, extracted through the '_s' entry list.
+
+ entry          -- title to look up; inserted verbatim into the shell command
+ and used as the start of the grep pattern
+ idx_s          -- sorted save-point table (presumably the value returned by
+ load_entrylist_table(); confirm against callers)
+ zindexfilename -- name of the .idx.gz file; the companion '_s.gz' name is
+ derived from it by replacing the last three characters
+
+ Returns False when bisect places `entry` at the end of `idx_s`, otherwise
+ the text printed by the zran_wdr | grep pipeline (empty if nothing matched).
+
+ Relies on the global `zranbin`, which assert_zran_runtime() sets.
+ NOTE(review): the command line is built by plain string formatting, so
+ quotes or shell metacharacters in `entry` or the file names reach the
+ shell unescaped, and `entry` is interpreted as a grep regular expression.
+ """
+ global zranbin
+ zindexfilename_s = zindexfilename[:-3] + '_s.gz'
+ # Position of the first table row sorting after (entry,); row i-1 is used as the start point below.
+ i=bisect.bisect(idx_s, (entry,))
+ # The -s value is the distance between the two neighbouring rows' second fields plus 255.
+ return i != len(idx_s) and os.popen(zranbin + ' %s -i %s %s -s %d | grep "^%s\t"' % (zindexfilename, zindexfilename_s, idx_s[i-1][1], int(idx_s[i][1]) - int(idx_s[i-1][1]) + 255, entry) ).read()
+
diff --git a/PyQT4/form3.ui b/PyQT4/form3.ui
new file mode 100644
index 0000000..87bddb1
--- /dev/null
+++ b/PyQT4/form3.ui
@@ -0,0 +1,356 @@
+
+ Form1
+
+
+
+ 0
+ 0
+ 648
+ 515
+
+
+
+ தமிழா கார்த்திகா 0.3: தமிழ்-ஆங்கிலம் அகராதி
+
+
+
+ 9
+
+
+ 6
+
+ -
+
+
+ apply python regex
+
+
+
+ -
+
+
+ debug mode
+
+
+ true
+
+
+
+ -
+
+
+ Qt::Horizontal
+
+
+
+ 40
+ 5
+
+
+
+
+ -
+
+
+ 0
+
+
+ 6
+
+
-
+
+
+
+ 75
+ true
+
+
+
+ textLabel1
+
+
+ Qt::AlignLeading|Qt::AlignLeft|Qt::AlignVCenter
+
+
+ false
+
+
+ 2
+
+
+
+ -
+
+
+ Qt::Horizontal
+
+
+ QSizePolicy::Expanding
+
+
+
+ 101
+ 20
+
+
+
+
+ -
+
+
+
+ 5
+ 5
+ 0
+ 0
+
+
+
+ close
+
+
+
+
+
+ -
+
+
+ -
+
+
+
+ -
+
+
+
+ 4
+ 5
+ 0
+ 0
+
+
+
+
+ 100
+ 0
+
+
+
+
+ 150
+ 16777215
+
+
+
+
+ -
+
+
+ 0
+
+
+ 6
+
+
-
+
+
+
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+ 28
+ 16777215
+
+
+
+ x
+
+
+
+ -
+
+
+ -
+
+
+ செல்
+
+
+
+ -
+
+
+ தேடுக
+
+
+
+ -
+
+
+ searchbox
+
+
+ true
+
+
+
+
+
+ -
+
+
+
+ 200
+ 2000
+
+
+
+ true
+
+
+ QAbstractItemView::SelectRows
+
+
+
+ -
+
+
+ உதவி
+
+
+
+ -
+
+
+
+
+
+ qPixmapFromMimeSource
+
+
+ QTextBrowser2
+ QTextBrowser
+
+
+
+
+ pushButton4
+ lineEdit1
+ pushButton1
+ pushButton3
+ listBox1
+ checkBox3
+ textEdit1
+ pushButton9
+ listBox3
+ checkBox1
+ textEdit2
+ textEdit3
+ pushButton2
+ pushButton3
+
+
+
+
+ checkBox1
+ toggled(bool)
+ textEdit3
+ setShown(bool)
+
+
+ 29
+ 501
+
+
+ 33
+ 393
+
+
+
+
+ checkBox1
+ toggled(bool)
+ textEdit2
+ setShown(bool)
+
+
+ 29
+ 501
+
+
+ 33
+ 195
+
+
+
+
+ pushButton4
+ clicked()
+ lineEdit1
+ clear()
+
+
+ 20
+ 20
+
+
+ 113
+ 33
+
+
+
+
+ checkBox3
+ toggled(bool)
+ pushButton3
+ setShown(bool)
+
+
+ 578
+ 32
+
+
+ 497
+ 30
+
+
+
+
+ checkBox3
+ toggled(bool)
+ listBox1
+ setShown(bool)
+
+
+ 578
+ 32
+
+
+ 31
+ 65
+
+
+
+
+ checkBox1
+ toggled(bool)
+ pushButton2
+ setShown(bool)
+
+
+ 56
+ 489
+
+
+ 482
+ 493
+
+
+
+
+
diff --git a/PyQT4/form3_smaller.ui b/PyQT4/form3_smaller.ui
new file mode 100644
index 0000000..40e2a6e
--- /dev/null
+++ b/PyQT4/form3_smaller.ui
@@ -0,0 +1,218 @@
+
+ Form1
+
+
+
+ 0
+ 0
+ 648
+ 515
+
+
+
+ Wikipedia dump reader
+
+
+ -
+
+
+ 6
+
+
+ 0
+
+
+ 0
+
+
+ 0
+
+
+ 0
+
+
-
+
+
+
+ 0
+ 0
+
+
+
+
+ 28
+ 16777215
+
+
+
+ x
+
+
+
+ -
+
+
+ Article or Search entry
+
+
+
+ -
+
+
+ Go
+
+
+
+ -
+
+
+ search
+
+
+
+ -
+
+
+ searchbox
+
+
+ true
+
+
+
+
+
+ -
+
+
+
+ 32767
+ 110
+
+
+
+ true
+
+
+ QAbstractItemView::SelectRows
+
+
+
+ -
+
+
+ -
+
+
-
+
+
+
+ 0
+ 0
+
+
+
+ close current article
+
+
+ close
+
+
+
+ -
+
+
+
+ 0
+ 0
+
+
+
+
+ 100
+ 0
+
+
+
+
+ 150
+ 16777215
+
+
+
+
+
+
+
+
+
+ qPixmapFromMimeSource
+
+
+ QTextBrowser2
+ QTextBrowser
+
+
+
+
+ pushButton4
+ lineEdit1
+ pushButton1
+ pushButton3
+ listBox1
+ checkBox3
+ textEdit1
+ listBox3
+
+
+
+
+ pushButton4
+ clicked()
+ lineEdit1
+ clear()
+
+
+ 20
+ 20
+
+
+ 113
+ 33
+
+
+
+
+ checkBox3
+ toggled(bool)
+ pushButton3
+ setShown(bool)
+
+
+ 578
+ 32
+
+
+ 497
+ 30
+
+
+
+
+ checkBox3
+ toggled(bool)
+ listBox1
+ setShown(bool)
+
+
+ 578
+ 32
+
+
+ 31
+ 65
+
+
+
+
+
diff --git a/PyQT4/indexdir/MAIN_WRITELOCK b/PyQT4/indexdir/MAIN_WRITELOCK
new file mode 100644
index 0000000..e69de29
diff --git a/PyQT4/indexdir/_MAIN_1.fln b/PyQT4/indexdir/_MAIN_1.fln
new file mode 100644
index 0000000..1a1c6aa
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_1.fln differ
diff --git a/PyQT4/indexdir/_MAIN_1.pst b/PyQT4/indexdir/_MAIN_1.pst
new file mode 100644
index 0000000..fe60717
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_1.pst differ
diff --git a/PyQT4/indexdir/_MAIN_1.sto b/PyQT4/indexdir/_MAIN_1.sto
new file mode 100644
index 0000000..c39a1a5
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_1.sto differ
diff --git a/PyQT4/indexdir/_MAIN_1.trm b/PyQT4/indexdir/_MAIN_1.trm
new file mode 100644
index 0000000..640cdb4
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_1.trm differ
diff --git a/PyQT4/indexdir/_MAIN_10.fln b/PyQT4/indexdir/_MAIN_10.fln
new file mode 100644
index 0000000..05ce562
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_10.fln differ
diff --git a/PyQT4/indexdir/_MAIN_10.pst b/PyQT4/indexdir/_MAIN_10.pst
new file mode 100644
index 0000000..5213536
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_10.pst differ
diff --git a/PyQT4/indexdir/_MAIN_10.sto b/PyQT4/indexdir/_MAIN_10.sto
new file mode 100644
index 0000000..ad19d9e
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_10.sto differ
diff --git a/PyQT4/indexdir/_MAIN_10.trm b/PyQT4/indexdir/_MAIN_10.trm
new file mode 100644
index 0000000..b9b9054
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_10.trm differ
diff --git a/PyQT4/indexdir/_MAIN_11.fln b/PyQT4/indexdir/_MAIN_11.fln
new file mode 100644
index 0000000..9b43dfe
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_11.fln differ
diff --git a/PyQT4/indexdir/_MAIN_11.pst b/PyQT4/indexdir/_MAIN_11.pst
new file mode 100644
index 0000000..024eb67
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_11.pst differ
diff --git a/PyQT4/indexdir/_MAIN_11.sto b/PyQT4/indexdir/_MAIN_11.sto
new file mode 100644
index 0000000..1fdf9e8
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_11.sto differ
diff --git a/PyQT4/indexdir/_MAIN_11.trm b/PyQT4/indexdir/_MAIN_11.trm
new file mode 100644
index 0000000..5e386df
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_11.trm differ
diff --git a/PyQT4/indexdir/_MAIN_12.fln b/PyQT4/indexdir/_MAIN_12.fln
new file mode 100644
index 0000000..0109d2e
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_12.fln differ
diff --git a/PyQT4/indexdir/_MAIN_12.pst b/PyQT4/indexdir/_MAIN_12.pst
new file mode 100644
index 0000000..bed333e
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_12.pst differ
diff --git a/PyQT4/indexdir/_MAIN_12.sto b/PyQT4/indexdir/_MAIN_12.sto
new file mode 100644
index 0000000..e9135a5
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_12.sto differ
diff --git a/PyQT4/indexdir/_MAIN_12.trm b/PyQT4/indexdir/_MAIN_12.trm
new file mode 100644
index 0000000..230b4a2
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_12.trm differ
diff --git a/PyQT4/indexdir/_MAIN_13.fln b/PyQT4/indexdir/_MAIN_13.fln
new file mode 100644
index 0000000..17be8f6
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_13.fln differ
diff --git a/PyQT4/indexdir/_MAIN_13.pst b/PyQT4/indexdir/_MAIN_13.pst
new file mode 100644
index 0000000..4b05556
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_13.pst differ
diff --git a/PyQT4/indexdir/_MAIN_13.sto b/PyQT4/indexdir/_MAIN_13.sto
new file mode 100644
index 0000000..a584eb5
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_13.sto differ
diff --git a/PyQT4/indexdir/_MAIN_13.trm b/PyQT4/indexdir/_MAIN_13.trm
new file mode 100644
index 0000000..9cf03e6
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_13.trm differ
diff --git a/PyQT4/indexdir/_MAIN_14.fln b/PyQT4/indexdir/_MAIN_14.fln
new file mode 100644
index 0000000..7384137
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_14.fln differ
diff --git a/PyQT4/indexdir/_MAIN_14.pst b/PyQT4/indexdir/_MAIN_14.pst
new file mode 100644
index 0000000..e35d249
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_14.pst differ
diff --git a/PyQT4/indexdir/_MAIN_14.sto b/PyQT4/indexdir/_MAIN_14.sto
new file mode 100644
index 0000000..f9f1d16
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_14.sto differ
diff --git a/PyQT4/indexdir/_MAIN_14.trm b/PyQT4/indexdir/_MAIN_14.trm
new file mode 100644
index 0000000..99de261
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_14.trm differ
diff --git a/PyQT4/indexdir/_MAIN_15.fln b/PyQT4/indexdir/_MAIN_15.fln
new file mode 100644
index 0000000..e7c0999
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_15.fln differ
diff --git a/PyQT4/indexdir/_MAIN_15.pst b/PyQT4/indexdir/_MAIN_15.pst
new file mode 100644
index 0000000..ef93eca
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_15.pst differ
diff --git a/PyQT4/indexdir/_MAIN_15.sto b/PyQT4/indexdir/_MAIN_15.sto
new file mode 100644
index 0000000..8d9ed6a
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_15.sto differ
diff --git a/PyQT4/indexdir/_MAIN_15.trm b/PyQT4/indexdir/_MAIN_15.trm
new file mode 100644
index 0000000..f802aa0
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_15.trm differ
diff --git a/PyQT4/indexdir/_MAIN_16.fln b/PyQT4/indexdir/_MAIN_16.fln
new file mode 100644
index 0000000..dbff982
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_16.fln differ
diff --git a/PyQT4/indexdir/_MAIN_16.pst b/PyQT4/indexdir/_MAIN_16.pst
new file mode 100644
index 0000000..6b25723
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_16.pst differ
diff --git a/PyQT4/indexdir/_MAIN_16.sto b/PyQT4/indexdir/_MAIN_16.sto
new file mode 100644
index 0000000..9bc9c62
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_16.sto differ
diff --git a/PyQT4/indexdir/_MAIN_16.trm b/PyQT4/indexdir/_MAIN_16.trm
new file mode 100644
index 0000000..d0cd03c
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_16.trm differ
diff --git a/PyQT4/indexdir/_MAIN_17.fln b/PyQT4/indexdir/_MAIN_17.fln
new file mode 100644
index 0000000..f35a77b
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_17.fln differ
diff --git a/PyQT4/indexdir/_MAIN_17.pst b/PyQT4/indexdir/_MAIN_17.pst
new file mode 100644
index 0000000..77b92b0
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_17.pst differ
diff --git a/PyQT4/indexdir/_MAIN_17.sto b/PyQT4/indexdir/_MAIN_17.sto
new file mode 100644
index 0000000..6cc9d76
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_17.sto differ
diff --git a/PyQT4/indexdir/_MAIN_17.trm b/PyQT4/indexdir/_MAIN_17.trm
new file mode 100644
index 0000000..cd35490
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_17.trm differ
diff --git a/PyQT4/indexdir/_MAIN_2.fln b/PyQT4/indexdir/_MAIN_2.fln
new file mode 100644
index 0000000..33da396
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_2.fln differ
diff --git a/PyQT4/indexdir/_MAIN_2.pst b/PyQT4/indexdir/_MAIN_2.pst
new file mode 100644
index 0000000..ed0ec61
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_2.pst differ
diff --git a/PyQT4/indexdir/_MAIN_2.sto b/PyQT4/indexdir/_MAIN_2.sto
new file mode 100644
index 0000000..711ffca
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_2.sto differ
diff --git a/PyQT4/indexdir/_MAIN_2.trm b/PyQT4/indexdir/_MAIN_2.trm
new file mode 100644
index 0000000..453ee64
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_2.trm differ
diff --git a/PyQT4/indexdir/_MAIN_29.fln b/PyQT4/indexdir/_MAIN_29.fln
new file mode 100644
index 0000000..253c03d
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_29.fln differ
diff --git a/PyQT4/indexdir/_MAIN_29.pst b/PyQT4/indexdir/_MAIN_29.pst
new file mode 100644
index 0000000..75b1b03
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_29.pst differ
diff --git a/PyQT4/indexdir/_MAIN_29.sto b/PyQT4/indexdir/_MAIN_29.sto
new file mode 100644
index 0000000..07f991d
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_29.sto differ
diff --git a/PyQT4/indexdir/_MAIN_29.trm b/PyQT4/indexdir/_MAIN_29.trm
new file mode 100644
index 0000000..49ad708
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_29.trm differ
diff --git a/PyQT4/indexdir/_MAIN_3.fln b/PyQT4/indexdir/_MAIN_3.fln
new file mode 100644
index 0000000..f640f53
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_3.fln differ
diff --git a/PyQT4/indexdir/_MAIN_3.pst b/PyQT4/indexdir/_MAIN_3.pst
new file mode 100644
index 0000000..84cec97
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_3.pst differ
diff --git a/PyQT4/indexdir/_MAIN_3.sto b/PyQT4/indexdir/_MAIN_3.sto
new file mode 100644
index 0000000..b7577b8
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_3.sto differ
diff --git a/PyQT4/indexdir/_MAIN_3.trm b/PyQT4/indexdir/_MAIN_3.trm
new file mode 100644
index 0000000..0aaaa15
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_3.trm differ
diff --git a/PyQT4/indexdir/_MAIN_4.fln b/PyQT4/indexdir/_MAIN_4.fln
new file mode 100644
index 0000000..86c8f9f
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_4.fln differ
diff --git a/PyQT4/indexdir/_MAIN_4.pst b/PyQT4/indexdir/_MAIN_4.pst
new file mode 100644
index 0000000..57b7d23
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_4.pst differ
diff --git a/PyQT4/indexdir/_MAIN_4.sto b/PyQT4/indexdir/_MAIN_4.sto
new file mode 100644
index 0000000..3247a92
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_4.sto differ
diff --git a/PyQT4/indexdir/_MAIN_4.trm b/PyQT4/indexdir/_MAIN_4.trm
new file mode 100644
index 0000000..4642c78
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_4.trm differ
diff --git a/PyQT4/indexdir/_MAIN_5.fln b/PyQT4/indexdir/_MAIN_5.fln
new file mode 100644
index 0000000..9153dbd
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_5.fln differ
diff --git a/PyQT4/indexdir/_MAIN_5.pst b/PyQT4/indexdir/_MAIN_5.pst
new file mode 100644
index 0000000..479aa32
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_5.pst differ
diff --git a/PyQT4/indexdir/_MAIN_5.sto b/PyQT4/indexdir/_MAIN_5.sto
new file mode 100644
index 0000000..b18d90a
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_5.sto differ
diff --git a/PyQT4/indexdir/_MAIN_5.trm b/PyQT4/indexdir/_MAIN_5.trm
new file mode 100644
index 0000000..048e6f8
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_5.trm differ
diff --git a/PyQT4/indexdir/_MAIN_6.fln b/PyQT4/indexdir/_MAIN_6.fln
new file mode 100644
index 0000000..2a04942
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_6.fln differ
diff --git a/PyQT4/indexdir/_MAIN_6.pst b/PyQT4/indexdir/_MAIN_6.pst
new file mode 100644
index 0000000..a84ba2a
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_6.pst differ
diff --git a/PyQT4/indexdir/_MAIN_6.sto b/PyQT4/indexdir/_MAIN_6.sto
new file mode 100644
index 0000000..363105d
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_6.sto differ
diff --git a/PyQT4/indexdir/_MAIN_6.trm b/PyQT4/indexdir/_MAIN_6.trm
new file mode 100644
index 0000000..606a92a
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_6.trm differ
diff --git a/PyQT4/indexdir/_MAIN_62.fln b/PyQT4/indexdir/_MAIN_62.fln
new file mode 100644
index 0000000..644345e
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_62.fln differ
diff --git a/PyQT4/indexdir/_MAIN_62.pst b/PyQT4/indexdir/_MAIN_62.pst
new file mode 100644
index 0000000..62d48d9
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_62.pst differ
diff --git a/PyQT4/indexdir/_MAIN_62.sto b/PyQT4/indexdir/_MAIN_62.sto
new file mode 100644
index 0000000..b540cbe
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_62.sto differ
diff --git a/PyQT4/indexdir/_MAIN_62.trm b/PyQT4/indexdir/_MAIN_62.trm
new file mode 100644
index 0000000..db2967f
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_62.trm differ
diff --git a/PyQT4/indexdir/_MAIN_7.fln b/PyQT4/indexdir/_MAIN_7.fln
new file mode 100644
index 0000000..435f0de
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_7.fln differ
diff --git a/PyQT4/indexdir/_MAIN_7.pst b/PyQT4/indexdir/_MAIN_7.pst
new file mode 100644
index 0000000..99f9ddc
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_7.pst differ
diff --git a/PyQT4/indexdir/_MAIN_7.sto b/PyQT4/indexdir/_MAIN_7.sto
new file mode 100644
index 0000000..3df62ee
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_7.sto differ
diff --git a/PyQT4/indexdir/_MAIN_7.trm b/PyQT4/indexdir/_MAIN_7.trm
new file mode 100644
index 0000000..e2fd2eb
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_7.trm differ
diff --git a/PyQT4/indexdir/_MAIN_76.fln b/PyQT4/indexdir/_MAIN_76.fln
new file mode 100644
index 0000000..d8dfccf
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_76.fln differ
diff --git a/PyQT4/indexdir/_MAIN_76.pst b/PyQT4/indexdir/_MAIN_76.pst
new file mode 100644
index 0000000..542f2ba
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_76.pst differ
diff --git a/PyQT4/indexdir/_MAIN_76.sto b/PyQT4/indexdir/_MAIN_76.sto
new file mode 100644
index 0000000..10905f8
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_76.sto differ
diff --git a/PyQT4/indexdir/_MAIN_76.toc b/PyQT4/indexdir/_MAIN_76.toc
new file mode 100644
index 0000000..f6e41f5
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_76.toc differ
diff --git a/PyQT4/indexdir/_MAIN_76.trm b/PyQT4/indexdir/_MAIN_76.trm
new file mode 100644
index 0000000..a1d7987
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_76.trm differ
diff --git a/PyQT4/indexdir/_MAIN_77.fln b/PyQT4/indexdir/_MAIN_77.fln
new file mode 100644
index 0000000..e69de29
diff --git a/PyQT4/indexdir/_MAIN_77.pst b/PyQT4/indexdir/_MAIN_77.pst
new file mode 100644
index 0000000..e69de29
diff --git a/PyQT4/indexdir/_MAIN_77.sto b/PyQT4/indexdir/_MAIN_77.sto
new file mode 100644
index 0000000..ce58bc9
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_77.sto differ
diff --git a/PyQT4/indexdir/_MAIN_77.trm b/PyQT4/indexdir/_MAIN_77.trm
new file mode 100644
index 0000000..0edd2d9
--- /dev/null
+++ b/PyQT4/indexdir/_MAIN_77.trm
@@ -0,0 +1 @@
+HASH
\ No newline at end of file
diff --git a/PyQT4/indexdir/_MAIN_8.fln b/PyQT4/indexdir/_MAIN_8.fln
new file mode 100644
index 0000000..e5684a2
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_8.fln differ
diff --git a/PyQT4/indexdir/_MAIN_8.pst b/PyQT4/indexdir/_MAIN_8.pst
new file mode 100644
index 0000000..10d6303
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_8.pst differ
diff --git a/PyQT4/indexdir/_MAIN_8.sto b/PyQT4/indexdir/_MAIN_8.sto
new file mode 100644
index 0000000..1d77573
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_8.sto differ
diff --git a/PyQT4/indexdir/_MAIN_8.trm b/PyQT4/indexdir/_MAIN_8.trm
new file mode 100644
index 0000000..7bd9fd7
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_8.trm differ
diff --git a/PyQT4/indexdir/_MAIN_9.fln b/PyQT4/indexdir/_MAIN_9.fln
new file mode 100644
index 0000000..a5a384b
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_9.fln differ
diff --git a/PyQT4/indexdir/_MAIN_9.pst b/PyQT4/indexdir/_MAIN_9.pst
new file mode 100644
index 0000000..614f011
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_9.pst differ
diff --git a/PyQT4/indexdir/_MAIN_9.sto b/PyQT4/indexdir/_MAIN_9.sto
new file mode 100644
index 0000000..792d1ea
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_9.sto differ
diff --git a/PyQT4/indexdir/_MAIN_9.trm b/PyQT4/indexdir/_MAIN_9.trm
new file mode 100644
index 0000000..91837f5
Binary files /dev/null and b/PyQT4/indexdir/_MAIN_9.trm differ
diff --git a/PyQT4/listWidget.py b/PyQT4/listWidget.py
new file mode 100644
index 0000000..1662e9c
--- /dev/null
+++ b/PyQT4/listWidget.py
@@ -0,0 +1,30 @@
+import sys
+from PyQt4.QtGui import QApplication, QWidget, QListWidget, QListWidgetItem, QHBoxLayout
+from PyQt4 import QtCore, QtGui
+from PyQt4.QtCore import Qt
+
+class ListWindow(QWidget):
+    """Demo window showing a QListWidget whose items carry extra user data.
+
+    Ten plain "Item N" entries are added, followed by one 'Text' entry that
+    stores a file name under ``Qt.UserRole``.  Activating an entry prints
+    its text and its ``Qt.UserRole`` payload.
+    """
+
+    def __init__(self, parent=None):
+        super(ListWindow, self).__init__(parent)
+        self.listWidget = QListWidget()
+        for i in range(1, 11):
+            self.listWidget.addItem("Item {}".format(i))
+        # Constructing the item with the list widget as its parent already
+        # inserts it, so no explicit addItem() call is needed.
+        item1 = QListWidgetItem('Text', self.listWidget)
+        item1.setData(Qt.UserRole, 'chunk-124.xml.bz2')
+        self.listWidget.itemActivated.connect(self.printItemText)
+        mainLayout = QHBoxLayout()
+        mainLayout.addWidget(self.listWidget)
+        self.setLayout(mainLayout)
+
+    def printItemText(self, item):
+        """Print the text and the Qt.UserRole data of the activated item.
+
+        ``item`` is the QListWidgetItem delivered by the ``itemActivated``
+        signal.  It is used directly instead of re-querying
+        ``currentItem()``, which may be None.
+        """
+        print(item.text())
+        print(item.data(Qt.UserRole).toPyObject())
+
+if __name__ == "__main__":
+    # Show a single ListWindow and run the Qt event loop.
+    application = QApplication(sys.argv)
+    window = ListWindow()
+    window.show()
+    application.exec_()
diff --git a/PyQT4/loader.ui b/PyQT4/loader.ui
new file mode 100644
index 0000000..5d57d76
--- /dev/null
+++ b/PyQT4/loader.ui
@@ -0,0 +1,142 @@
+
+ Dialog
+
+
+
+ 0
+ 0
+ 580
+ 373
+
+
+
+ Select a dump file
+
+
+
+ 9
+
+
+ 6
+
+ -
+
+
+ 0
+
+
+ 6
+
+
-
+
+
+ <html><head><meta name="qrichtext" content="1" /><style type="text/css">
+p, li { white-space: pre-wrap; }
+</style></head><body style=" font-family:'Sans Serif'; font-size:9pt; font-weight:400; font-style:normal;">
+<p style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;"><span style=" font-size:14pt;">Wikipedia Dump Reader</span></p>
+<p style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px; font-size:14pt;"><span style=" font-size:9pt;">select a dump file</span></p></body></html>
+
+
+ Qt::AlignCenter
+
+
+
+ -
+
+
+
+ 0
+ 0
+ 0
+ 0
+
+
+
+ Recent files :
+
+
+
+ -
+
+
+ -
+
+
+ Qt::Horizontal
+
+
+ QDialogButtonBox::Cancel|QDialogButtonBox::NoButton|QDialogButtonBox::Ok|QDialogButtonBox::Open
+
+
+ true
+
+
+
+
+
+ -
+
+
+
+ 5
+ 0
+ 0
+ 0
+
+
+
+ <html><head><meta name="qrichtext" content="1" /><style type="text/css">
+p, li { white-space: pre-wrap; }
+</style></head><body style=" font-family:'Sans Serif'; font-size:9pt; font-weight:400; font-style:normal;">
+<p style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;"><span style=" font-size:11pt; font-style:italic;">First Time User ?</span></p>
+<p style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px; font-size:11pt; font-style:italic;"><span style=" font-size:9pt; font-style:normal;">You can download a dump for your language from the wikipedia web server at :</span></p>
+ <p style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;"><a href="http://download.wikimedia.org/backup-index.html"><span style=" text-decoration: underline; color:#0000ff;">http://download.wikimedia.org/backup-index.html</span></a></p>
+ <p style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;">They are files generally named such as pages-articles.xml.bz2</p><p style=" margin-top:0px; margin-bottom:0px; margin-left:0px; margin-right:0px; -qt-block-indent:0; text-indent:0px;">for example, a french dump may be named frwiki-20080514-pages-articles.xml.bz2</p><
+ </body></html>
+
+
+ Qt::AlignLeading|Qt::AlignLeft|Qt::AlignVCenter
+
+
+ true
+
+
+
+
+
+
+
+
+ buttonBox
+ accepted()
+ Dialog
+ accept()
+
+
+ 248
+ 254
+
+
+ 157
+ 274
+
+
+
+
+ buttonBox
+ rejected()
+ Dialog
+ reject()
+
+
+ 316
+ 260
+
+
+ 286
+ 274
+
+
+
+
+
diff --git a/PyQT4/mathexp.py b/PyQT4/mathexp.py
new file mode 100644
index 0000000..37c3513
--- /dev/null
+++ b/PyQT4/mathexp.py
@@ -0,0 +1,69 @@
+import commands
+import tempfile
+import os
+#import md5
+from os.path import join as J
+
+class MathExp(object):
+ """Front end for rendering LaTeX math.
+ Holds a parser object (currently always a Texvc instance) whose parse
+ method takes a latex string and returns an expression that can be
+ displayed within QTextBrowser.
+ """
+ def __init__(self, keepMathImageFiles):
+ # keepMathImageFiles is passed straight through to Texvc.
+ #math_parser = config.math_parser
+ #self.math_parser = globals()[math_parser]()
+ self.math_parser = Texvc(keepMathImageFiles)
+ def parse_exp(self, m):
+ # m is a match object; group(1) is handed to the parser as the LaTeX source.
+ # If the parser raises anything, a "(Failed to render)" message containing
+ # the source is returned instead of propagating the error.
+ math_reg_exp = m.group(1)
+ try:
+ output = self.math_parser.parse(math_reg_exp)
+ # NOTE(review): a bare except also catches KeyboardInterrupt/SystemExit.
+ except:
+ # NOTE(review): the literal below spans two lines in this patch; markup
+ # inside it appears to have been lost -- restore it from the upstream file.
+ output = '(Failed to render) %s
' % math_reg_exp
+ return output
+
+class Texvc(object):
+ """Math parser that renders LaTeX by running the external texvc program.
+ Images are written to a 'wikipediaDumpReader_texvm/' directory under the
+ system temporary directory.
+ """
+
+ def __init__(self, keepMathImageFiles = False):
+ # Locate texvc: first next to this module, then "./texvc", then "texvc"
+ # as a bare command name. NameError is raised when the command still
+ # exits with a non-zero status.
+ # NOTE(review): indentation is not preserved in this patch, so the exact
+ # nesting of the checks below must be confirmed against the upstream file.
+ self.texvc = J(os.path.dirname(__file__), 'texvc')
+ if not os.path.exists(self.texvc):
+ self.texvc = "./texvc"
+ # Test the program
+ if commands.getstatusoutput(self.texvc)[0] != 0:
+ self.texvc = "texvc"
+ if commands.getstatusoutput(self.texvc)[0] != 0:
+ raise NameError
+ self.images_path = J(tempfile.gettempdir(), 'wikipediaDumpReader_texvm/')
+ # An existing image directory is emptied unless keepMathImageFiles is set;
+ # the else branch (presumably paired with the exists() check) creates it.
+ if os.path.exists(self.images_path):
+ if not keepMathImageFiles:
+ files_to_remove = os.listdir(self.images_path)
+ for f in files_to_remove:
+ os.remove(self.images_path + f)
+ else:
+ os.mkdir(self.images_path)
+ # test again (paranoid mode ;-)
+ test = self.parse("test")
+ # The slice is meant to cut the image file path out of what parse()
+ # returned; the offsets depend on the exact literal parse() builds.
+ # NOTE(review): that literal is not intact in this patch -- confirm 26/-3.
+ if not os.path.exists(test[26:-3]):
+ raise NameError
+ if keepMathImageFiles:
+ print "Info : maths images are generated in %s and are not deleted after exiting" % self.images_path
+
+ def parse(self, m):
+ # Run texvc on the LaTeX string m and return markup built from the image
+ # directory and the 32-character hash read from texvc's output.
+ # tried to implement a cache system
+ # texvc seems to change the string on which it computes the md5 (eg. a^{1} => a^{{1}})
+ #prehash = md5.md5(m.replace('{', '{{').replace('}','}}')).hexdigest() # bug when frac{{a}}
+ #prehash = md5.md5(m).hexdigest()
+ #print "\n"
+ #print m
+ #print prehash
+ #if not os.path.exists("%s%s.png" % (self.images_path, prehash)):
+ if 1:
+ print "Rendering mathematics, please wait slightly"
+ # NOTE(review): m is placed between single quotes in a shell command with
+ # no escaping, so a quote character inside m changes the command line.
+ cmd = self.texvc + " " + self.images_path + " "+ self.images_path + " '" + m + "' utf-8" + " 2>/dev/null"
+ status, output = commands.getstatusoutput(cmd)
+ prehash = output[1:33]
+ # NOTE(review): the returned literal spans two lines in this patch; markup
+ # inside it appears to have been lost -- restore it from the upstream file.
+ return '
' % (self.images_path, prehash) # md5 is 32char, first char is texvm's output code
+ #temp_hash = str.rindex(output, "\n") + 1
+ #texvc_flag = output[temp_hash]
+ #texvc_hash = output[temp_hash + 1 : temp_hash + 33]
+ #parsed_math = '
' % (self.images_path, texvc_hash)
+ #return parsed_math
diff --git a/PyQT4/mparser.py b/PyQT4/mparser.py
new file mode 100644
index 0000000..ca97161
--- /dev/null
+++ b/PyQT4/mparser.py
@@ -0,0 +1,80 @@
+try:
+ # import mod.bz2 as bz2
+ import bz2
+except ImportError:
+ import mod64.bz2 as bz2
+import gzip
+import sys
+import codecs
+import struct
+from xml.dom.minidom import parseString
+
+def buildIndex(inputbz2Archive, outidxname, outblockname, callback = None):
+ # Read the bz2 archive inputbz2Archive line by line and, for each article
+ # found, write one tab-separated, UTF-8 encoded line (title, bzblocks1[0],
+ # an offset derived from start, ending - start) to a gzip stream stored at
+ # outidxname. outblockname is opened for writing as a second output file.
+ # callback is not used in the lines shown.
+ # NOTE(review): this function is incomplete as shown in this patch: lines
+ # are missing after the first readline() check (there is no `try:` for the
+ # `except StopIteration` at the end, and writeBlock, blocknum, numarticles,
+ # bzblocks1 and maxL are not defined in the lines shown), and several
+ # string literals look truncated. Restore it from the upstream file.
+ # NOTE(review): tellbzblock() is not a standard bz2.BZ2File method;
+ # presumably it comes from the bundled bz2 build -- confirm.
+ # 1. gzip force ascii handlers, dunno how to fix, 2. gzip.open refuses utf8 names
+ zindexfile = gzip.GzipFile(fileobj=open(outidxname, 'w'), filename="")
+ blocksfile = open(outblockname, 'w')
+ f = bz2.BZ2File(inputbz2Archive)
+ #print f.tellbzblock()
+ #f.readline()
+ #print f.tellbzblock()
+ if f.readline()[0:10] != "\n":
+ start = f.tell()
+ break
+ if l == "":
+ raise StopIteration
+
+ #print "article found at (unziped) offset : ", start
+
+ while 1:
+ l = f.readline()
+ bzblocks2 = f.tellbzblock()
+ if bzblocks2[0] != blocknum:
+ blocknum = writeBlock(bzblocks2)
+ print numarticles, " articles found"
+ if l[0:11] == ' ':
+ titleline = l
+ if l == "
\n":
+ ending = f.tell()
+ break
+ if l == "":
+ raise StopIteration
+
+ # The remembered title line is parsed as XML to extract the article title.
+ D = parseString(titleline)
+ n = D.getElementsByTagName('title')
+ title = n[0].firstChild.nodeValue
+ numarticles += 1
+
+ zindexfile.write( ("%s\t%s\t%d\t%d\n" % (title, bzblocks1[0], start - (bzblocks1[1][0] + bzblocks1[1][1] * maxL), ending - start) ).encode('utf-8') )
+ except StopIteration:
+ print "End of that block : ", `f.name`
+
+if __name__ == '__main__':
+    # Command-line entry point: index the dump named by the first argument.
+    dump_path = sys.argv[1]
+    if not dump_path.endswith('.xml.bz2'):
+        print "first argument should be a wikipedia .xml.bz2 file"
+    else:
+        # Drop the 8-character '.xml.bz2' suffix to derive both output names.
+        stem = dump_path[:-len('.xml.bz2')]
+        buildIndex(dump_path, stem + '.idx.gz', stem + '.blocks.idx')
diff --git a/PyQT4/searcher.py b/PyQT4/searcher.py
new file mode 100644
index 0000000..88cc5e9
--- /dev/null
+++ b/PyQT4/searcher.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# -*- encoding: UTF-8 -*-
+
+'''
+The text entered during the searching should be parsed into search objects for
+the Whoosh library.
+
+This file recieves the text entered and parses into searchable objects and
+performs search operations.
+'''
+
+import re
+import bz2
+import os
+
+import BeautifulSoup
+from whoosh import index
+from whoosh.fields import *
+from whoosh.qparser import QueryParser
+from xml.dom.minidom import parseString
+
+def search_for(text):
+    """Search the Whoosh index for *text* and return the matching entries.
+
+    The index is opened from the ``indexdir`` directory located next to the
+    running script (``sys.argv[0]``).  *text* is parsed as a query against
+    the ``word`` field and every hit is returned (no result limit).
+
+    Returns a list of dicts, one per hit, each with the keys ``'word'`` and
+    ``'meaning'`` copied from the fields of the hit.
+    """
+    # Imported here because this module has no top-level ``import sys``; the
+    # name would otherwise only resolve if a star import happened to leak it.
+    import sys
+
+    index_dir = os.path.join(
+        os.path.dirname(os.path.abspath(sys.argv[0])), "indexdir")
+    ix = index.open_dir(index_dir)
+    with ix.searcher() as searcher:
+        query = QueryParser("word", ix.schema).parse(unicode(text))
+        results = searcher.search(query, limit=None)
+        # Copy the fields out while the searcher is still open.
+        return [{'meaning': hit['meaning'], 'word': hit['word']}
+                for hit in results]
+
+def get_markup(word,meaning):
+ ''' Return the wiki text for an index entry.
+ If meaning starts with something matching the chunk file name pattern
+ (chunk-[0-9]{1,}.xml.bz2), the file of that name in the "chunks" directory
+ next to the running script is read line by line, lines are accumulated
+ while the writ flag is set (a line containing word switches it on), and
+ the text of the "text" element found in the accumulated lines is returned.
+ Otherwise meaning is taken to be the wiki text itself and is returned
+ unchanged.'''
+ # print word,meaning
+ filexp = re.compile("chunk-[0-9]{1,}.xml.bz2")
+ if filexp.match(meaning):
+ #parse the file and return wiki text
+ # print "Contains File Name"
+ # bzfile = bz2.BZ2File(os.path.join("C:/Sites\\wikipediaDumpReader\\chunks\\",meaning))
+ # NOTE(review): this module has no top-level "import sys"; sys.argv below
+ # relies on the name being available some other way -- confirm.
+ # NOTE(review): bzfile is never closed in this function.
+ bzfile = bz2.BZ2File(os.path.join(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), "chunks"),meaning))
+ xmltext = ''
+ writ = False
+ for li in bzfile:
+ line = unicode(li, 'utf-8')
+ if word in line:
+ writ = True
+ if writ:
+ xmltext += line
+ # NOTE(review): '' in line is always true, so as written writ is reset
+ # after every line; the end-marker literal may have been lost in this
+ # patch -- confirm against the upstream file.
+ if '' in line:
+ writ = False
+ soup = BeautifulSoup.BeautifulSoup(xmltext)
+ text = soup.find("text").text
+ # text = "மீடியாவிக்கி:Watchlistஎன் கவனிப்புப் பட்டியல்"
+ # ExpatError: mismatched tag (with the xmltext)
+ # UnicodeDecodeError: 'ascii' codec can't decode byte (with the hard-coded text)
+ # D = parseString(xmltext.encode('utf8'))
+ # n = D.getElementsByTagName('text')
+ # text = n[0].firstChild.nodeValue
+ return text
+ else:
+ return meaning
+
+
+if __name__ == "__main__":
+    # Interactive smoke test: search, list the hits with their positions,
+    # then print the markup of the hit the user picks.
+    searchterm = raw_input("Enter the Search Term: ")
+    r = search_for(searchterm)
+    # enumerate() numbers the hits by position; list.index() returned the
+    # first *equal* dict, so duplicate hits were all shown with one number.
+    for position, rs in enumerate(r):
+        print str(position), unicode(rs['word'])
+    choice = int(raw_input('Enter your option: '))
+    opt = r[choice]
+    print get_markup(opt['word'], opt['meaning'])
+
+
+
diff --git a/PyQT4/setup.py b/PyQT4/setup.py
new file mode 100644
index 0000000..bef9d34
--- /dev/null
+++ b/PyQT4/setup.py
@@ -0,0 +1,29 @@
+from distutils.core import setup
+import py2exe
+
+# Module and DLL selection handed to py2exe.
+includes = []
+excludes = [
+    '_gtkagg', '_tkagg', 'bsddb', 'curses', 'email', 'pywin.debugger',
+    'pywin.debugger.dbgcon', 'pywin.dialogs', 'tcl',
+    'Tkconstants', 'Tkinter',
+]
+packages = []
+dll_excludes = [
+    'libgdk-win32-2.0-0.dll', 'libgobject-2.0-0.dll', 'tcl84.dll',
+    'tk84.dll',
+]
+
+# py2exe build options, kept apart from the setup() call for readability.
+py2exe_options = {
+    "compressed": 2,
+    "optimize": 2,
+    "includes": includes,
+    "excludes": excludes,
+    "packages": packages,
+    "dll_excludes": dll_excludes,
+    "bundle_files": 1,
+    "dist_dir": "dist",
+    "xref": False,
+    "skip_archive": False,
+    "ascii": False,
+    "custom_boot_script": '',
+}
+
+setup(
+    options={"py2exe": py2exe_options},
+    zipfile=None,
+    windows=['Karthika.py'],
+)