
Commit 5d11511

QL source code added. README updated accordingly.

1 parent d7854b1

15 files changed: +3104 −0 lines changed

README.md

Lines changed: 13 additions & 0 deletions
@@ -1,6 +1,8 @@
 # Qulac
 A dataset on asking Questions for Lack of Clarity in open-domain information-seeking conversations.
 
+**\*\*\*\*\*\*\*\* New October 12th, 2019: Document retrieval code added \*\*\*\*\*\*\*\***
+
 **\*\*\*\*\*\*\*\* New August 2nd, 2019: Links to auxiliary data added \*\*\*\*\*\*\*\***
 
 ## Introduction
@@ -270,6 +272,17 @@ To help decode the IDs we have also provided two simple code snippets that c
     for k in range(2, 2 + len(tokens) - 4):
         history_ids += (tokens[k],)
     return topic_id, facet_id, history_ids, question_id, with_ans
+
+## Code
+We have released the source code of the document retrieval component. As mentioned in [1], the retrieval model is an extension of the query likelihood (QL) model, in which we assign different weights to the original query and to the question and answer text.
+
+The code is available in the `./src/` directory. The main retrieval model can be found in `QL.py`, which uses the Cython function `ql_score` to calculate the relevance score faster. Therefore, before running the code, you must compile and build the Cython package `ql_score` on your device.
+
+**Build the Cython package:** To build the Cython package on your device, go to `./src/ql_score/` and run the following command: `python setup.py build_ext --inplace`
+
+Furthermore, our code uses the custom index that we have built (available [here](http://ciir.cs.umass.edu/downloads/qulac/)), as well as the ClueWeb term statistics file, which is also available [here](http://ciir.cs.umass.edu/downloads/qulac/). The file `run_QL.ipynb` provides a Jupyter Notebook example of how to execute a query.
+
+**NOTE:** The index is built using the Krovetz stemmer. Therefore, in order to replicate the results of [1], you need to install the Krovetz stemmer on your machine and use it in your code. Once you have installed the stemmer, modify `./src/utils/utils.py` to use it as the default stemmer. To do so, follow the `TODO` comments in that file.
 
 
 ## Citation
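
Since `QL.py` (below) imports `tokenize_and_stem` from `utils.utils`, the `TODO` presumably swaps the stemmer inside that helper. A hypothetical sketch of that swap, assuming the `krovetz` package from PyPI provides the Krovetz stemmer (the actual `utils.py` may expose different names):

```python
# Hypothetical sketch of the TODO swap in ./src/utils/utils.py.
# Assumes the `krovetz` PyPI package (pip install krovetz); the actual
# helper names and tokenization in the repository may differ.
import nltk
import krovetz

_kstemmer = krovetz.PyKrovetzStemmer()

def tokenize_and_stem(text):
    # Tokenize, then apply the Krovetz stemmer so that query and document
    # terms match the Krovetz-stemmed index and term statistics.
    return [_kstemmer.stem(tok) for tok in nltk.word_tokenize(text.lower())]
```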

src/QL.py

Lines changed: 201 additions & 0 deletions
@@ -0,0 +1,201 @@
import pandas as pd
import numpy as np
import re
import nltk
from utils.utils import *
import time
from ql_score import ql_score
import pickle


class QL:
    """Query likelihood (QL) retrieval model, extended to interpolate the
    original query with the clarifying question and its answer."""

    alpha = 0.5   # interpolation weight between query and question/answer models
    mu = 1500.    # Dirichlet smoothing parameter

    _inverted_index = {}

    # ClueWeb collection statistics (defaults; can be recomputed via load_doc_stats).
    _mean_doc_len = 770.4786222801615
    _total_docs = 33836981
    _total_terms = 0

    def __init__(self, do_stemming, do_stopword_removal, data_root='./', load_stats=True):
        self.do_stemming = do_stemming
        self.do_stopword_removal = do_stopword_removal
        self.data_root = data_root
        self._stopwords = nltk.corpus.stopwords.words('english')

        self._term_stats_path = self.data_root + 'clueweb_stats/term_stats.pkl'
        self._term_stats_porter_path = self.data_root + 'clueweb_stats/term_stats.porter.pkl'
        self._term_stats_krovetz_path = self.data_root + 'clueweb_stats/term_stats.krovetz.pkl'
        self._doc_stats_path = self.data_root + 'clueweb_stats/doc_lengths'
        self._index_path = self.data_root + 'topic_indexes/{}.pkl'

        if load_stats:
            # Load collection term statistics, matching the stemming configuration.
            if self.do_stemming:
                self._term_stats = pd.read_pickle(self._term_stats_krovetz_path)[1].to_dict()
            else:
                self._term_stats = pd.read_pickle(self._term_stats_path)[1].to_dict()

            for k in self._term_stats:
                self._total_terms += self._term_stats[k]

            # Exclude stopword occurrences from the collection size if they are removed.
            if self.do_stopword_removal:
                for stopw in self._stopwords:
                    self._total_terms -= self._term_stats.get(stopw, 0)

    def _stopword_removal(self, tokens):
        return [word for word in tokens if word not in self._stopwords]

    def load_doc_stats(self):
        doc_lengths = pd.read_csv(self._doc_stats_path, sep='\t', header=None)
        self._mean_doc_len = doc_lengths[2].mean()
        self._total_docs = len(doc_lengths.index)

    def load_topic_index(self, topic_id):
        """Load the per-topic inverted index, adjusting document lengths for stopwords."""
        with open(self._index_path.format(topic_id), 'rb') as inp:
            self._inverted_index = pickle.load(inp)
        if self.do_stopword_removal:
            for doc in self._inverted_index:
                for stopw in self._stopwords:
                    if stopw in self._inverted_index[doc]['terms']:
                        self._inverted_index[doc]['length'] -= self._inverted_index[doc]['terms'][stopw]

    def update_query_lang_model(self, query, question, answer):
        """Build the interpolated query language model: alpha weights the
        original query against the question/answer text."""
        output = {}

        query_tokens, qlen = self._preprocess(query)
        if type(question) == str:
            other_tokens, other_len = self._preprocess(question + ' ' + answer)
        else:
            other_tokens, other_len = self._preprocess(question + answer)

        all_tokens = set(list(query_tokens.keys()) + list(other_tokens.keys()))

        for t in all_tokens:
            try:
                qfreq = float(query_tokens[t]) / qlen
            except KeyError:
                qfreq = 0
            try:
                qafreq = float(other_tokens[t]) / other_len
            except KeyError:
                qafreq = 0
            output[t] = self.alpha * qfreq + (1 - self.alpha) * qafreq

        self._query_lm = output

    def _preprocess(self, text):
        """Tokenize (and optionally stem and remove stopwords), returning a
        term-frequency dict and the token count."""
        if type(text) == str:
            if self.do_stemming:
                text_tokens = tokenize_and_stem(text)
            else:
                text_tokens = tokenize_only(text)

            if self.do_stopword_removal:
                text_tokens = self._stopword_removal(text_tokens)
        else:
            text_tokens = text

        output = dict()
        for t in text_tokens:
            if t not in output:
                output[t] = 0.
            output[t] += 1.

        return output, len(text_tokens)

    def _add_doc_to_inverted_index_if_not_existing(self, document_id, document):
        if document_id not in self._inverted_index:
            document_tokens, length = self._preprocess(document)
            self._inverted_index[document_id] = {'terms': document_tokens,
                                                 'length': length}

    def get_result_list(self):
        output = []
        for doc_id in self._inverted_index:
            output.append((doc_id, self.get_interpolated_score(doc_id)))
        return output

    def get_result_df(self, topk, query_id):
        df = pd.DataFrame(self.get_result_list()).sort_values(1, ascending=False).head(topk)
        df['record_id'] = query_id
        return df

    def get_interpolated_score(self, document_id):
        """Score a document against the current query language model using
        Dirichlet-smoothed query likelihood (computed in Cython for speed)."""
        doc_inf = self._inverted_index[document_id]
        doc_len = doc_inf['length']

        score = 0.
        for t in self._query_lm:
            try:
                dfreq = doc_inf['terms'][t]
            except KeyError:
                dfreq = 0
            try:
                nq = self._term_stats[t]
            except KeyError:
                nq = 0.

            q_score = self._query_lm[t]
            score += ql_score.ql_score_f(q_score, dfreq, self.mu, doc_len, nq, self._total_terms)

        return score
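
For orientation, here is a minimal usage sketch of the `QL` class above. The topic id, query, question, answer, and `query_id` values are illustrative assumptions, not values from the repository (the authoritative walkthrough is `run_QL.ipynb`); it also assumes the term statistics, topic indexes, and compiled `ql_score` package are in place.

```python
# Minimal usage sketch (assumed values; see run_QL.ipynb for the
# authoritative example). Requires the downloaded topic indexes and
# ClueWeb statistics under data_root, plus the built ql_score package.
from QL import QL

ql = QL(do_stemming=True, do_stopword_removal=True, data_root='./')

ql.load_topic_index(1)  # hypothetical Qulac topic id
ql.update_query_lang_model(
    'dinosaur',                                # original query (illustrative)
    'are you interested in dinosaur fossils',  # clarifying question (illustrative)
    'yes i want to find fossil exhibits')      # user answer (illustrative)

# Rank all documents in the topic index and keep the top 10 for this record.
results = ql.get_result_df(topk=10, query_id='1-1-X-1')  # hypothetical record id
print(results)
```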

src/ql_score/README.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# Install

Run this command on your system to build the Cython extension:

`python setup.py build_ext --inplace`
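
The Cython source itself is not shown in this diff, but the commented-out pure-Python code removed from `get_interpolated_score` in `QL.py` suggests what `ql_score_f` computes. A rough pure-Python equivalent, reconstructed from those comments (an assumption, not the actual Cython source):

```python
import math

def ql_score_f(q_score, dfreq, mu, doc_len, nq, total_terms):
    """Sketch of the Cython ql_score_f, reconstructed from the commented-out
    code in QL.py: the query-model weight of a term times the log of its
    Dirichlet-smoothed probability in the document."""
    # Smoothed term probability: (tf + mu * p_collection) / (|d| + mu)
    d_score = (float(dfreq) + mu * (float(nq) / total_terms)) / (doc_len + mu)
    if d_score <= 0:
        return 0.0  # guard against log(0), e.g. terms with zero collection frequency
    return q_score * math.log(d_score)
```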
