Skip to content

Commit 16d882a

Browse files
committed
Added multiple URLS
1 parent c69ebf2 commit 16d882a

File tree

1 file changed

+28
-12
lines changed

1 file changed

+28
-12
lines changed

scrape.py

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,39 @@
77

88
app = modal.App(name="link-scraper")  # create an app called link-scraper
99

10-
@app.function()
11-
def get_links(url):
12-
response = urllib.request.urlopen(url)
13-
html = response.read().decode("utf8")
14-
15-
links = []
10+
playwright_image = modal.Image.debian_slim(python_version="3.10").run_commands(
11+
"apt-get update",
12+
"apt-get install -y software-properties-common",
13+
"apt-add-repository non-free",
14+
"apt-add-repository contrib",
15+
"pip install playwright==1.42.0",
16+
"playwright install-deps chromium",
17+
"playwright install chromium",
18+
)
19+
20+
# Use a custom container image so the Playwright package is available to the function
21+
# Playwright launches a headless Chromium browser, which can render content generated dynamically by JavaScript
22+
@app.function(image = playwright_image)
23+
async def get_links(cur_url: str):
24+
from playwright.async_api import async_playwright
1625

17-
for match in re.finditer('href="(.*?)"', html): # find all strings that match the HTML hyperlink pattern
18-
links.append(match.group(1))
26+
async with async_playwright() as p:
27+
browser = await p.chromium.launch()
28+
page = await browser.new_page()
29+
await page.goto(cur_url)
30+
links = await page.eval_on_selector_all("a[href]", "elements => elements.map(element => element.href)")
31+
await browser.close()
1932

20-
return links
33+
print("Links", links)
34+
return links
2135

2236

2337
@app.local_entrypoint()
24-
def main(url):
25-
links = get_links.remote(url)
26-
print(links)
38+
def main():
39+
urls = ["http://modal.com", "http://github.com"]
40+
for links in get_links.map(urls):
41+
for link in links:
42+
print(link)
2743

2844

2945

0 commit comments

Comments
 (0)