Skip to content

Commit

Permalink
Fixed names and updated utils
Browse files Browse the repository at this point in the history
  • Loading branch information
miodeqqq committed Dec 3, 2016
1 parent 583af10 commit 19f2820
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 14 deletions.
8 changes: 2 additions & 6 deletions pdf_to_grobid.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import requests

from .utils import Colors, urls_dict
from .utils import Colors, urls_dict, headers

PDFS = sys.argv[1]
XMLS_PATH_DOWNLOAD = sys.argv[2]
Expand All @@ -18,7 +18,7 @@
class Grobid(object):
"""
Usage: ./pdf_to_grobid.py pdfs_dir grobid_output_dir grobid_url
For example: ./pdf_to_grobid.py pdfs_data grobid_output_data local
For example: ./pdf_to_grobid.py pdfs_data grobid_output_data grobid
"""

def __init__(self, *args, **kwargs):
Expand Down Expand Up @@ -102,10 +102,6 @@ def process_pdfs_to_grobid(self):
'input': pdf
}

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
}

try:

grobid = session.post(
Expand Down
12 changes: 4 additions & 8 deletions pdf_to_tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@

import requests

from .utils import Colors, urls_dict
from .utils import Colors, urls_dict, headers

PDFS = sys.argv[1]
TXT_PATH_DOWNLOAD = sys.argv[2]


class Tika(object):
"""
Usage: ./pdf_file_to_tika.py pdfs_dir tika_output_dir tika_url
For example: ./pdf_file_to_grobid.py pdfs_data tika_output_data rs
Usage: ./pdf_to_tika.py pdfs_dir tika_output_dir tika_url
For example: ./pdf_to_tika.py pdfs_data tika_output_data tika
"""

def __init__(self, *args, **kwargs):
Expand Down Expand Up @@ -96,10 +96,6 @@ def process_pdfs_to_tika(self):

pdf_file = open(file, 'rb')

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
}

try:

tika = session.put(
Expand Down Expand Up @@ -148,7 +144,7 @@ def process_pdfs_to_tika(self):
file_ext='.txt'), 'w') as tika_output:
tika_output.write(txt_file)

print('Saved grobid TXT output to file!')
print('Saved TIKA TXT output to file!')

print(Colors.BOLD + '________________________________________________' + Colors.ENDC)

Expand Down
4 changes: 4 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,8 @@ class Colors():
# Service name -> HTTP endpoint of a locally running conversion server.
# NOTE(review): the keys appear to be what the scripts accept as their
# third command-line argument ("grobid" / "tika") -- confirm against callers.
urls_dict = dict(
    grobid='http://localhost:1234/processFulltextDocument',
    tika='http://localhost:9876/tika',
)

# Default HTTP request headers, kept in one place so they are not redefined
# by each script that imports them. The User-Agent value is a fixed
# desktop-Chrome browser string; it is split across adjacent literals only
# to keep line length manageable -- the resulting value is a single string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/41.0.2228.0 Safari/537.36'
    ),
}

0 comments on commit 19f2820

Please sign in to comment.