-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathL24.py
More file actions
122 lines (85 loc) · 4.2 KB
/
L24.py
File metadata and controls
122 lines (85 loc) · 4.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import csv, requests
from bs4 import BeautifulSoup
# Output file that every scraped article is appended to (pipe-delimited csv).
filename = 'londonarticles.csv'
# Fetches a URL and returns its parsed document tree.
def getPage(url):
    # Download the raw HTML, then hand it straight to BeautifulSoup with
    # the lxml parser so callers get a navigable tree.
    response = requests.get(url)
    return BeautifulSoup(response.text, "lxml")
# parses the page for each news item and extracts title, time and text
def parseNewsPage(url):
soup = getPage(url)
article = soup.find("article")
articleheader = article.find("div", "content-a")
title = articleheader.find('h1')
print "Title: " + title.text
time = articleheader.find('p', 'updated')
if time is None:
time = articleheader.find("span", "publication-time")
print "Time: " + time.getText()
standfirsttag = articleheader.find('div', 'stand-first')
text = standfirsttag.find('p').getText()
ptags = articleheader.findAll('p', recursive=False)
for p in ptags:
if p.get("class") is None:
text += p.text.strip()
print "Text: " + text
saveEntry(title.text, time.getText(), text)
# Appends one article record (title | time | content) to the csv file.
def saveEntry(titlestr, timestr, contentstr):
    # utf-8 encode every field up front: the Python 2 csv module writes
    # byte strings, not unicode.
    row = [field.encode('utf-8') for field in (titlestr, timestr, contentstr)]
    with open(filename, 'a') as csvfile:
        csv.writer(csvfile, delimiter='|').writerow(row)
# parses the page with news listing and gets the url for each news item
def parseNewsListing(page):
articlespart = page.find("div", "search-results inner-a col-sm-6 col-md-8")
divs = articlespart.find_all("div", "teaser-image")
numarticleswritten = 0
for div in divs:
linkelement = div.find('a')
link = linkelement.get('href')
if link.startswith("/news"):
link = "http://www.london24.com" + link
print link
parseNewsPage(link)
numarticleswritten += 1
return numarticleswritten
# this function finds maxpage and starts sending requests to all pages upto maxpage
def navigatePages():
numarticlesread = 0
totalnumarticles = 0
totalnumarticleswritten = 0
pagenum = 122
while numarticlesread <= totalnumarticles:
print "PAGE: " + str(pagenum)
# retrieve the listings page
pageurl = "http://www.london24.com/home/search?sort=publishedDate_descending&numberOfItemsPerPage=40&submitted=true&toDate=02%2F05%2F2016&excludeSiteIds=%5B%5D&selectedCategories=%5B%5B%5B%5B%5B%5B%5B%5B%5B%5B%5B%5D%5D%5D%5D%5D%5D%5D%5D%5D%5D%5D&siteExternalID=london.d&excludedCategories=%5B%5B%5B%5B%5B%5B%5B%5B%5B%5B%5B%5D%5D%5D%5D%5D%5D%5D%5D%5D%5D%5D&category=tree_department.categorydimension.archant%3ACategory.General.Crime.NewsHard&includeSiteIds=%5BLondon.d%5D&distanceInMiles=0.0&siteId=2.3224&action=search&facetQueries=publishedDate%3A%5BNOW%2FDAY-7DAYS+TO+NOW%5D&facetQueries=publishedDate%3A%5BNOW%2FYEAR+TO+NOW%5D&facetQueries=publishedDate%3A%5BNOW%2FDAY-30DAYS+TO+NOW%5D&facetQueries=publishedDate%3A%5BNOW%2FDAY+TO+NOW%5D&numberOfItemsToSearchPerPage=10&facetFields=tree_department.categorydimension.archant&publishDateInterval=uk.co.polopoly.search.util.DateInterval%40750d907c&page=" + str(pagenum)
page = getPage(pageurl)
# parse listing, get article urls, retrieve article contents and write articles to csv
totalnumarticleswritten += parseNewsListing(page)
# update number of articles read
resultselement = page.find("div", "search-results-scope")
resultsstr = resultselement.find("p").getText()
strparts = resultsstr.split()
totalnumarticles = int(strparts[2].strip())
resultsrange = strparts[0].split('-')
numarticlesread = int(resultsrange[1])
print "Read " + str(numarticlesread) + " articles out of " + str(totalnumarticles)
pagenum += 1
print "Total number of articles written: " + totalnumarticleswritten
# Creates the csv output file (truncating any previous run) and writes the
# header row. Uses "with" so the file handle is closed even if the write
# fails (the original's manual open/close leaked the handle on error).
def initFile():
    with open(filename, 'w') as fobj:
        csv.writer(fobj, delimiter='|').writerow(['title', 'time', 'content'])
#### MAIN PROGRAM ####
def main():
    # Start a fresh output file, then crawl the whole site into it.
    initFile()
    navigatePages()

##### CALL MAIN PROGRAM #####
if __name__ == '__main__':
    main()