diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..08da43a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.tmp +github_data \ No newline at end of file diff --git a/download_repo_text.py b/download_repo_text.py index af1ec1c..11beef5 100644 --- a/download_repo_text.py +++ b/download_repo_text.py @@ -12,7 +12,7 @@ import argparse import subprocess from itertools import repeat - +import copy bad_extensions = [ 'app', 'bin', @@ -164,7 +164,7 @@ def get_content(f): def _process_repo(repo_data, repodir): - out = None + out = [] # get metadata name, stars, lang = repo_data meta = {'repo_name': name, 'stars': stars, 'repo_language': lang} @@ -192,13 +192,14 @@ def _process_repo(repo_data, repodir): text_outputs.append(None) for i in range(len(files)): text = text_outputs[i] + meta_ind = copy.deepcopy(meta) if text is not None: - meta['file_name'] = filenames[i] - meta['mime_type'] = extensions[i] + meta_ind['file_name'] = filenames[i] + meta_ind['mime_type'] = extensions[i] if out is None: - out = [[text, meta]] + out = [[text, meta_ind]] else: - out.append([text, meta]) + out.append([text,meta_ind]) shutil.rmtree(repodir, ignore_errors=True) except TimeoutError: print(f"Processing for {name} timed out") @@ -226,7 +227,7 @@ def process_repo_list(repo_data, clone_timeout, processing_timeout): p.kill() shutil.rmtree(f'{repodir}/.git', ignore_errors=True) # extracts text files from repo and returns them as list : [[text, metadata], ... ] - out = process_repo(repo_data, repodir, processing_timeout=processing_timeout) + out = _process_repo(repo_data, repodir)#, processing_timeout=processing_timeout) except Exception: err = traceback.format_exc() if verbose: @@ -315,4 +316,4 @@ def process_args(): success_hist.append((not_none / len(repos_out)) * 100) success_rate = sum(success_hist) / len(success_hist) pbar.set_postfix({"Success Rate": success_rate}) - ar.commit() # final commit + ar.commit() # final commit \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8abed81..9a9ae70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,3 @@ requests=='2.21.0' tqdm=='4.47.0' joblib=='0.16.0' fire=='0.3.1' -python-magic