From a3b84ab497ac026ac33f2be2aba2abbe4082da57 Mon Sep 17 00:00:00 2001 From: Nikolai Sachok Date: Tue, 25 Jun 2019 00:31:47 +0300 Subject: [PATCH] Fixed similar apps parsing Fixes parsing of similar apps details. It looks like Google obfuscated the CSS class names. The update finds apps by matching the links on the page against a regular expression. --- play_scraper/api.py | 4 ++-- play_scraper/scraper.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/play_scraper/api.py b/play_scraper/api.py index 9f58c33..88b156a 100644 --- a/play_scraper/api.py +++ b/play_scraper/api.py @@ -79,7 +79,7 @@ def search(query, page=None, detailed=False, hl='en', gl='us'): return s.search(query, page, detailed) -def similar(app_id, detailed=False, hl='en', gl='us'): +def similar(app_id, detailed=True, hl='en', gl='us'): """Sends a GET request, follows the redirect, and retrieves a list of applications similar to the specified app. @@ -88,7 +88,7 @@ def similar(app_id, detailed=False, hl='en', gl='us'): :return: a list of similar apps """ s = scraper.PlayScraper(hl, gl) - return s.similar(app_id) + return s.similar(app_id, detailed) def categories(hl='en', gl='us', ignore_promotions=True): diff --git a/play_scraper/scraper.py b/play_scraper/scraper.py index ffae983..88a8085 100644 --- a/play_scraper/scraper.py +++ b/play_scraper/scraper.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import logging +import re try: from urllib import quote_plus from urlparse import urljoin @@ -54,15 +55,16 @@ def _parse_multiple_apps(self, list_response): :param list_response: the Response object from a list request :return: a list of app dictionaries """ - list_strainer = SoupStrainer('span', - {'class': 'preview-overlay-container'}) + list_strainer = SoupStrainer('a') soup = BeautifulSoup(list_response.content, 'lxml', - from_encoding='utf8', - parse_only=list_strainer) + parse_only=list_strainer, + from_encoding='utf8') + + # getting app_ids from href attributes of app links on the similar apps page + 
app_tags = soup.find_all(href=re.compile("\/store\/apps\/details\?id=")) + app_ids = list(set([link.attrs["href"][23:] for link in app_tags])) # convert to a set to keep only unique ids, then back to a list because multi_futures_app_request requires a list - app_ids = [x.attrs['data-docid'] - for x in soup.select('span.preview-overlay-container')] return multi_futures_app_request(app_ids, params=self.params) def details(self, app_id):