forked from lynn737/GreenThumb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
47 lines (36 loc) · 1.49 KB
/
main.py
File metadata and controls
47 lines (36 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import json
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Fetch the blog index page. The timeout keeps the script from hanging forever
# on a stalled connection, and raise_for_status() aborts the run on an HTTP
# error instead of parsing an error page as if it were the article listing.
response = requests.get(
    'https://www.happyhouseplants.co.uk/blogs/houseplant-blog',
    timeout=10,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
# Keep only the first 5 article title links found on the page.
articles = soup.find_all('a', class_='article__title')[:5]
def get_first_two_paragraphs(content):
    """Return the text of the first two non-empty <p> tags, joined by a space.

    Args:
        content: Parsed element to search (anything exposing ``find_all``),
            or a falsy value such as ``None`` when no element was found.

    Returns:
        The stripped text of up to two paragraphs separated by a single
        space, or ``"No text content found"`` when ``content`` is falsy or
        none of its paragraphs contain text.
    """
    if not content:
        return "No text content found"

    texts = []
    for paragraph in content.find_all('p'):
        text = paragraph.get_text().strip()
        # Skip empty paragraphs *before* counting, so leading blank <p> tags
        # do not use up the two-paragraph budget.
        if text:
            texts.append(text)
            if len(texts) == 2:
                break

    return ' '.join(texts) if texts else "No text content found"
# Site root used to resolve the article links taken from the blog index.
BASE_URL = "https://www.happyhouseplants.co.uk"
# Maximum number of characters of article text kept in the 'sentence' field.
SUMMARY_MAX_CHARS = 120
# Seconds to wait for an article page before giving up.
REQUEST_TIMEOUT = 10

# Build one summary record per article link and dump them all as JSON.
article_list = []
for article in articles:
    article_text = article.get_text().strip()
    # urljoin copes with site-relative hrefs as well as already-absolute ones,
    # which plain string concatenation would turn into a broken URL.
    url = urljoin(BASE_URL, article['href'])

    # Fail loudly on a stalled connection or HTTP error rather than writing a
    # record scraped from an error page.
    article_response = requests.get(url, timeout=REQUEST_TIMEOUT)
    article_response.raise_for_status()
    article_soup = BeautifulSoup(article_response.text, 'lxml')

    # Keep the first 10 characters of the <time> tag's datetime attribute
    # (presumably the YYYY-MM-DD part of an ISO timestamp -- confirm).
    date_tag = article_soup.find('time')
    if date_tag and date_tag.has_attr('datetime'):
        publication_date = date_tag['datetime'][:10]
    else:
        publication_date = "Date not found"

    content = article_soup.find('div', class_='rte')
    first_two = get_first_two_paragraphs(content)
    # Only mark the text as truncated when something was actually cut off.
    if len(first_two) > SUMMARY_MAX_CHARS:
        first_two = first_two[:SUMMARY_MAX_CHARS] + "..."

    article_info = {
        'title': article_text,
        'date': publication_date,
        'sentence': first_two,
        'url': url,
    }
    article_list.append(article_info)

with open('./components/articles.json', 'w', encoding='utf-8') as f:
    json.dump(article_list, f, indent=2)