# NewsChain/article.py (kkjdroid/NewsChain, master branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from markov import markov
from uuid import uuid4
import hashlib
class article:
    """A news article: a headline plus a list of downloaded, captioned images.

    Instance attributes (when generated from scratch):
      headline -- value returned by ``markov()`` (presumably the generated
                  headline text; it is used as the image search query)
      guid     -- random hex identifier; names the article's image directory
      images   -- list of ``(relative_path, caption)`` tuples
    """

    def __init__(self, d = None):
        """Restore an article from ``d`` or generate a brand-new one.

        d -- optional mapping of attribute names to values. When it is
             non-empty its items become the instance attributes and nothing
             is generated or downloaded. When it is ``None`` or empty, a new
             headline is generated and images for it are fetched.
        """
        if d:
            # Copy the items instead of aliasing the caller's dict, so later
            # attribute assignments on this article do not silently mutate
            # the mapping that was passed in.
            self.__dict__.update(d)
            return
        self.headline = markov()
        image_urls = image.get_image_urls(self.headline)
        self.guid = uuid4().hex
        self.images = []
        for url in image_urls:
            # Constructing an image downloads it and looks up its caption.
            img = image(url, self.guid)
            path = './static/images/{}/{}'.format(self.guid, img.filename)
            self.images.append((path, img.caption))

class image:
    """An image belonging to an article: downloaded to disk and captioned.

    Instance attributes set by ``__init__``:
      filename -- file name inside ``static/images/<guid>/``
      article  -- guid of the owning article
      caption  -- text returned by ``get_caption`` ('' when none is found)
    """

    def __init__(self, url, guid):
        """Download ``url`` into the article's image directory and caption it.

        url  -- direct URL of the image
        guid -- identifier of the owning article; names the image directory
        """
        self.filename = ''
        self.article = guid
        self.download_image(url)
        self.caption = self.get_caption(url)

    def download_image(self, url):
        """Save ``url`` under ``static/images/<article guid>/`` (best effort).

        Sets ``self.filename`` to ``image<NNNNNNNN>``, where the number is the
        count of entries already in the directory. Download failures are
        tolerated: the file is simply not written, but ``self.filename`` is
        still set.
        """
        import http.client
        import os
        import urllib.error
        import urllib.request
        directory = 'static/images/{}'.format(self.article)
        os.makedirs(directory, exist_ok=True)
        self.filename = 'image{:08d}'.format(len(os.listdir(directory)))
        # Write to an explicit path instead of chdir-ing into the directory:
        # the working directory is process-wide state.
        target = os.path.join(directory, self.filename)
        try:
            urllib.request.urlretrieve(url, target)
        except urllib.error.HTTPError:
            # The server answered with an error status; report the URL.
            print(url)
        except (urllib.error.URLError, http.client.HTTPException,
                ValueError, OSError):
            # Unreachable host, dropped connection, malformed URL or local
            # I/O problem: skip this image. KeyboardInterrupt and SystemExit
            # are deliberately not caught.
            pass

    @staticmethod
    def get_image_urls(line, limit = 10):
        """Search Google Images for ``line`` and return direct image URLs.

        line  -- search text
        limit -- maximum number of URLs to return

        Returns a list of at most ``limit`` distinct URLs, in page order.
        Drives Chrome through Selenium inside a virtual display; both are
        shut down before returning, even on error.
        """
        import time
        import urllib.parse
        from selenium import webdriver
        from pyvirtualdisplay import Display
        query = 'http://www.google.com/search?tbm=isch&tbs=sur:fmc&*&q={}'.format(
            urllib.parse.quote_plus(line))
        display = Display(visible=0, size=(800,600))
        display.start()
        try:
            browser = webdriver.Chrome()
            try:
                browser.get(query)
                time.sleep(1)  # give the results page a moment to render
                # TODO: get link to page as well as direct link
                hrefs = [elem.get_attribute('href')
                         for elem in browser.find_elements_by_class_name('rg_l')]
            finally:
                browser.quit()
        finally:
            display.stop()
        # Anchors without an href, or without an imgurl parameter, are skipped.
        candidates = (image.get_image_from_link(href) for href in hrefs if href)
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(u for u in candidates if u))[:limit]

    @staticmethod
    def get_caption(url = 'https://static.pexels.com/photos/104827/cat-pet-animal-domestic-104827.jpeg'):
        """Return caption text for ``url`` via Google search-by-image.

        Fetches the search-by-image results page and returns the text of the
        first ``<a class="_gUb">`` element, or '' when there is none.
        NOTE(review): '_gUb' is presumably Google's "best guess" link class;
        confirm it still matches the live markup.
        """
        import urllib.parse
        import urllib.request
        from bs4 import BeautifulSoup
        query = 'https://www.google.com/searchbyimage?image_url={}'.format(urllib.parse.quote(url, safe=''))
        # Send a browser-like User-Agent with the request.
        headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20120101 Firefox/33.0' }
        req = urllib.request.Request(query, None, headers)
        with urllib.request.urlopen(req) as response:
            bs = BeautifulSoup(response.read(), 'html.parser')
        link = bs.find('a', {'class': '_gUb'})
        if link is None:
            return ''
        return link.get_text(' ', strip = True)

    @staticmethod
    def get_image_from_link(s):
        """Extract the ``imgurl`` query parameter from a result link.

        s -- link URL whose query string may contain ``imgurl=<encoded url>``

        Returns the decoded image URL, or ``None`` when the link has no
        ``imgurl`` parameter.
        """
        import urllib.parse
        params = urllib.parse.parse_qs(urllib.parse.urlparse(s).query)
        imgurls = params.get('imgurl')
        if not imgurls:
            return None
        # parse_qs has already percent-decoded the value; decoding it a second
        # time would corrupt URLs that legitimately contain '%XX' or '+'.
        return imgurls[0]