7
7
8
8
app = modal .App (name = "link-scraper" ) # Create a Modal app named "link-scraper"
9
9
10
- @app .function ()
11
- def get_links (url ):
12
- response = urllib .request .urlopen (url )
13
- html = response .read ().decode ("utf8" )
14
-
15
- links = []
10
# Shell commands executed in order while building the image: refresh apt,
# enable the non-free and contrib repositories, then install Playwright 1.42.0
# together with Chromium and the system packages Chromium depends on.
_PLAYWRIGHT_SETUP_COMMANDS = (
    "apt-get update",
    "apt-get install -y software-properties-common",
    "apt-add-repository non-free",
    "apt-add-repository contrib",
    "pip install playwright==1.42.0",
    "playwright install-deps chromium",
    "playwright install chromium",
)

# Debian-slim (Python 3.10) container image with Playwright and Chromium set up.
playwright_image = modal.Image.debian_slim(python_version="3.10").run_commands(
    *_PLAYWRIGHT_SETUP_COMMANDS
)
19
+
20
# Run inside the custom Playwright image: the function drives a headless
# Chromium browser, so links produced by dynamic JavaScript are collected too.
@app.function(image=playwright_image)
async def get_links(cur_url: str) -> list[str]:
    """Return the ``href`` of every ``<a href>`` element on the page at *cur_url*.

    The page is loaded in headless Chromium and the anchors are read from the
    live DOM. The collected links are also printed before being returned.

    Args:
        cur_url: Address of the page to open.

    Returns:
        One string per matching anchor, taken from the element's ``href``
        property, in the order the selector query returns them.

    Raises:
        Whatever Playwright raises when launching the browser, navigating to
        the page, or evaluating the selector fails; the browser is closed
        before the exception propagates.
    """
    # Imported inside the function because Playwright is installed by the
    # container image this function runs in.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(cur_url)
            links = await page.eval_on_selector_all(
                "a[href]",
                "elements => elements.map(element => element.href)",
            )
        finally:
            # Close the browser even when navigation or evaluation fails.
            await browser.close()

    print("Links", links)
    return links
21
35
22
36
23
37
@app.local_entrypoint()
def main():
    """Collect the links of a fixed pair of sites and print one link per line."""
    targets = ["http://modal.com", "http://github.com"]
    # get_links.map yields one list of links per target URL; flatten those
    # lists lazily so links are printed as each result arrives.
    discovered = (
        href
        for page_links in get_links.map(targets)
        for href in page_links
    )
    for href in discovered:
        print(href)
27
43
28
44
29
45
0 commit comments