-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeedparsertest.py
44 lines (30 loc) · 1.08 KB
/
feedparsertest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#from feedgen.feed import FeedGenerator
import feedparser,html,re,requests,hashlib,listparser
from readability import Document
from xml.etree import ElementTree
import sys
def cleantext(text): #just wanna cleanup and remove html
clean = re.compile('<.*?>')
return html.unescape(re.sub(clean, '', text))
def extract_article(url): #rajouter choix automatique en fonction de la taille
response = requests.get(url)
doc = Document(response.text)
title=doc.title()
article=cleantext(doc.summary())
return([title,article])#retourne le nom de l'article puis le texte
def feed_hash_check(feed):
#regarder dans la bdd et retourne le dernier hash
print("s")
def feed_extractor(feed):
compil=[]
for x in feedparser.parse(feed)['entries']:
compil.append(extract_article(x['link']))
print((compil[0][0]))
lasthash = hashlib.md5((compil[0][0]).encode('utf-8')).hexdigest()
print(lasthash)
return(compil)
""""""
feed="https://www.lemonde.fr/rss/une.xml"
feed_extractor(feed)
"""
"""