Commit 0ab1ae3

Update scraper requirements version, xpath, tutorial

1 parent: 27f7128

9 files changed (+77, -77 lines)

scrape/living_social/scraper_app/items.py (+1, -2)

```diff
@@ -15,9 +15,8 @@
 class LivingSocialDeal(Item):
     """Livingsocial container (dictionary-like object) for scraped data"""
     title = Field()
-    description = Field()
     link = Field()
-    category = Field()
     location = Field()
     original_price = Field()
     price = Field()
+    end_date = Field()
```
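A quick way to sanity-check the slimmed-down item is to build one by hand. This is a hypothetical smoke test, assuming you run it from the project directory so `scraper_app` is importable; the field values are made up:

```python
# Hypothetical smoke test for the updated LivingSocialDeal item.
from scraper_app.items import LivingSocialDeal

deal = LivingSocialDeal(title="Spa Packages with Jacuzzi and Pool Access",
                        price="80",
                        end_date="2015-09-17T11:59:59")

# Items behave like dictionaries for the fields they declare.
print(deal['title'])         # Spa Packages with Jacuzzi and Pool Access
print(deal.get('end_date'))  # 2015-09-17T11:59:59
```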

scrape/living_social/scraper_app/models.py (+6, -5)

```diff
@@ -10,7 +10,7 @@
 Direct run will create the table.
 """

-from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy import create_engine, Column, Integer, String, DateTime
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.engine.url import URL

@@ -28,19 +28,20 @@ def db_connect():
     """
     return create_engine(URL(**settings.DATABASE))

+
 def create_deals_table(engine):
     """"""
     DeclarativeBase.metadata.create_all(engine)

+
 class Deals(DeclarativeBase):
     """Sqlalchemy deals model"""
     __tablename__ = "deals"

     id = Column(Integer, primary_key=True)
     title = Column('title', String)
-    description = Column('description', String, nullable=True)
     link = Column('link', String, nullable=True)
     location = Column('location', String, nullable=True)
-    category = Column('category', String, nullable=True)
-    original_price = Column('original_price', String, nullable=True)
-    price = Column('price', String, nullable=True)
+    original_price = Column('original_price', Integer, nullable=True)
+    price = Column('price', Integer, nullable=True)
+    end_date = Column('end_date', DateTime, nullable=True)
```
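Wiring these pieces together works the same as before the change; here is a minimal sketch of creating the table and inserting one row with the new column types (assuming `settings.DATABASE` is filled in and Postgres is running — the row values are made up):

```python
from datetime import datetime

from sqlalchemy.orm import sessionmaker

from scraper_app.models import db_connect, create_deals_table, Deals

engine = db_connect()        # create_engine(URL(**settings.DATABASE))
create_deals_table(engine)   # creates "deals" if it does not already exist

session = sessionmaker(bind=engine)()
# price/original_price are now Integer, end_date is a DateTime.
session.add(Deals(title="Spa Packages with Jacuzzi and Pool Access",
                  original_price=120, price=80,
                  end_date=datetime(2015, 9, 17, 11, 59, 59)))
session.commit()
```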

scrape/living_social/scraper_app/settings.py (+8, -6)

```diff
@@ -12,9 +12,11 @@

 ITEM_PIPELINES = ['scraper_app.pipelines.LivingSocialPipeline']

-DATABASE = {'drivername': 'postgres',
-            'host': 'localhost',
-            'port': '5432',
-            'username': 'YOUR_USERNAME',  # fill in your username here
-            'password': 'YOUR_PASSWORD',  # fill in your password here
-            'database': 'scrape'}
+DATABASE = {
+    'drivername': 'postgres',
+    'host': 'localhost',
+    'port': '5432',
+    'username': 'YOUR_USERNAME',  # fill in your username here
+    'password': 'YOUR_PASSWORD',  # fill in your password here
+    'database': 'scrape'
+}
```
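For reference, `models.py` hands this dictionary to SQLAlchemy's `URL` class via `URL(**DATABASE)`; with the SQLAlchemy 0.9.x pinned in this commit, stringifying the result shows the connection URL it builds (a sketch with the placeholder credentials, of course):

```python
from sqlalchemy.engine.url import URL

DATABASE = {
    'drivername': 'postgres',
    'host': 'localhost',
    'port': '5432',
    'username': 'YOUR_USERNAME',
    'password': 'YOUR_PASSWORD',
    'database': 'scrape'
}

print(str(URL(**DATABASE)))
# postgres://YOUR_USERNAME:YOUR_PASSWORD@localhost:5432/scrape
```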

scrape/living_social/scraper_app/spiders/livingsocial_spider.py (+13, -10)

```diff
@@ -18,19 +18,22 @@


 class LivingSocialSpider(BaseSpider):
-    """Spider for regularly updated livingsocial.com site, San Francisco page"""
+    """
+    Spider for regularly updated livingsocial.com site, San Francisco page
+    """
     name = "livingsocial"
     allowed_domains = ["livingsocial.com"]
-    start_urls = ["http://www.livingsocial.com/cities/15-san-francisco"]
+    start_urls = ["https://www.livingsocial.com/cities/15-san-francisco"]

     deals_list_xpath = '//li[@dealid]'
-    item_fields = {'title': './/a/div[@class="deal-bottom"]/h3[@itemprop]/text()',
-                   'link': './/a/@href',
-                   'description': './/a/div[@class="deal-bottom"]/p/text()',
-                   'category': './/a/div[@class="deal-top"]/div[@class="deal-category"]/span/text()',
-                   'location': './/a/div[@class="deal-top"]/ul[@class="unstyled deal-info"]/li/text()',
-                   'original_price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-original"]/del/text()',
-                   'price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-price"]/text()'}
+    item_fields = {
+        'title': './/span[@itemscope]/meta[@itemprop="name"]/@content',
+        'link': './/a/@href',
+        'location': './/a/div[@class="deal-details"]/p[@class="location"]/text()',
+        'original_price': './/a/div[@class="deal-prices"]/div[@class="deal-strikethrough-price"]/div[@class="strikethrough-wrapper"]/text()',
+        'price': './/a/div[@class="deal-prices"]/div[@class="deal-price"]/text()',
+        'end_date': './/span[@itemscope]/meta[@itemprop="availabilityEnds"]/@content'
+    }

     def parse(self, response):
         """
@@ -45,7 +48,7 @@ def parse(self, response):
         selector = HtmlXPathSelector(response)

         # iterate over deals
-        for deal in selector.select(self.deals_list_xpath):
+        for deal in selector.xpath(self.deals_list_xpath):
             loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

             # define processors
```
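Since the commit swaps `selector.select()` for `selector.xpath()` and rewrites every field XPath, it can be worth checking the new expressions outside a full crawl. A sketch using Scrapy's standalone `Selector` against a made-up HTML fragment shaped like what the new XPaths expect (the real livingsocial.com markup may differ):

```python
from scrapy.selector import Selector

# Made-up fragment mirroring the structure the new XPaths expect.
HTML = '''
<li dealid="1370546">
  <span itemscope>
    <meta itemprop="name" content="Thrilling Live-Action Escape Game for Six">
    <meta itemprop="availabilityEnds" content="2015-02-08T08:00:00">
  </span>
  <a href="https://www.livingsocial.com/deals/1370546">
    <div class="deal-details"><p class="location">San Francisco, CA</p></div>
    <div class="deal-prices"><div class="deal-price">$99</div></div>
  </a>
</li>
'''

for deal in Selector(text=HTML).xpath('//li[@dealid]'):
    title = deal.xpath('.//span[@itemscope]/meta[@itemprop="name"]/@content').extract()
    price = deal.xpath('.//a/div[@class="deal-prices"]/div[@class="deal-price"]/text()').extract()
    print(title, price)  # ([u'Thrilling Live-Action Escape Game for Six'], [u'$99'])
```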

scrape/requirements.txt (+3, -9)

```diff
@@ -1,9 +1,3 @@
-SQLAlchemy==0.8.0b2
-Scrapy==0.16.4
-Twisted==12.3.0
-lxml==3.1beta1
-psycopg2==2.4.6
-pyOpenSSL==0.13
-requests==1.1.0
-w3lib==1.2
-zope.interface==4.0.3
+SQLAlchemy==0.9.8
+Scrapy==0.24.4
+psycopg2==2.5.4
```
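Note that most of the dropped pins (Twisted, lxml, w3lib, pyOpenSSL, and through Twisted, zope.interface) are not gone from the environment: Scrapy declares them as its own dependencies, so `pip install -r requirements.txt` still pulls in compatible versions transitively, which is presumably why only the packages the project uses directly stay pinned.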

website/_containers/scrape/2013-03-02-part-5.md (+10, -11)

````diff
@@ -56,24 +56,23 @@ lynnroot=# \connect scrape
 psql (9.1.4, server 9.1.3)
 You are now connected to database "scrape" as user "lynnroot".
 scrape=# select * from deals limit 5;
- id | title | description | link | location | category | original_price | price
-----+---------------------------------------+------------------------------------+----------------------------------------------------------------------------+--------------+------------+----------------+-------
-  1 | Mini Box | Deck of Photo Playing Cards | /cities/1719-newyork-citywide/deals/614972-deck-of-photo-playing-cards | national | | 29 | 9
-  2 | Paintball: LivingSocial Original: | Paintball + BBQ Day Trip | /cities/1719-newyork-citywide/deals/575448-paintball-bbq-day-trip | NYC Citywide | activities | | 69
-  3 | Medieval Times | Medieval Times: Meal + Show Ticket | /cities/1719-newyork-citywide/deals/627242-medieval-times-meal-show-ticket | NYC Citywide | activities | 41 | 27
-  4 | '80s Boat Cruise: LivingSocial Ori... | NYC Boat Cruise + '80s Concert | /cities/1719-newyork-citywide/deals/610320-nyc-boat-cruise-80s-concert | NYC Citywide | activities | | 29
-  5 | New York Magazine | 50 Issues of New York Magazine | /cities/1719-newyork-citywide/deals/594056-50-issues-of-new-york-magazine | NYC Citywide | | 30 | 15
+ id | title | link | location | original_price | price | end_date
+----+---------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+----------------+-------+---------------------
+  1 | Three-Course Prix-Fixe Contemporary American Meal | https://www.livingsocial.com/deals/1365378-three-course-prix-fixe-contemporary-american-meal | Calistoga, CA | 132 | 66 | 2015-12-24 08:00:00
+  2 | Thrilling Live-Action Escape Game for Six | https://www.livingsocial.com/deals/1370546-thrilling-live-action-escape-game-for-six | San Francisco, CA | 169 | 99 | 2015-02-08 08:00:00
+  3 | $30 to Spend on Food and Drink | https://www.livingsocial.com/cities/15-san-francisco/deals/1234440-30-to-spend-on-food-and-drink | San Francisco, CA | 30 | 15 | 2015-07-29 11:59:00
+  4 | $50 Toward Asian-Inspired Seafood, Steaks & More | https://www.livingsocial.com/deals/1333444-50-toward-asian-inspired-seafood-steaks-more | San Francisco, CA | 50 | 25 | 2015-02-19 12:59:00
+  5 | Spa Packages with Jacuzzi and Pool Access | https://www.livingsocial.com/deals/1278806-spa-packages-with-jacuzzi-and-pool-access | San Francisco, CA | 120 | 80 | 2015-09-17 11:59:59
 (5 rows)
 ```

 Try a few of these select queries:

 ```psql
-scrape=# select * from deals where title like ('%Yoga');
-scrape=# select * from deals where description like ('%Yoga%');
-scrape=# select * from deals where description like ('%Dinner');
-scrape=# select link from deals where description like ('%Photography%');
 scrape=# select title from deals limit 30;
+scrape=# select link from deals where price < 50;
+scrape=# select title from deals where end_date < '2015-02-08';
+scrape=# select * from deals where title like ('%Yoga');
 ```

 Notice how the string we’re searching for contains `%` – this is a wildcard character. The last query essentially says "find deals whose title ends with 'Yoga'" (a `%` on both sides would match 'Yoga' anywhere in the title). Learn more about [querying Postgres](http://www.postgresql.org/docs/8.4/static/tutorial-select.html).
````
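Since part 3 of the tutorial maps this table with SQLAlchemy, the same queries can also be issued from Python. A sketch, assuming the `db_connect()` and `Deals` definitions from `models.py` and a database the spider has already populated:

```python
from sqlalchemy.orm import sessionmaker

from scraper_app.models import db_connect, Deals

session = sessionmaker(bind=db_connect())()

# Equivalent of: select * from deals where title like ('%Yoga');
for deal in session.query(Deals).filter(Deals.title.like('%Yoga')):
    print(deal.title)

# The new Integer/DateTime columns make comparisons natural:
cheap = session.query(Deals).filter(Deals.price < 50).all()
```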

website/_containers/scrape/2013-03-04-part-3.md (+23, -21)

````diff
@@ -61,17 +61,19 @@ SPIDER_MODULES = ['scraper_app.spiders']
 We then define our database through a dictionary:

 ```python
-DATABASE = {'drivername': 'postgres',
-            'host': 'localhost',
-            'port': '5432',
-            'username': 'lynnroot',
-            'password': 'root',
-            'database': 'scrape'}
+DATABASE = {
+    'drivername': 'postgres',
+    'host': 'localhost',
+    'port': '5432',
+    'username': 'YOUR_USERNAME',
+    'password': 'YOUR_PASSWORD',
+    'database': 'scrape'
+}
 ```

 The `drivername` is the type of database we're using – Postgres. Since we're using a Postgres installed on our own computer, the location, or the `host`, is `localhost`. The port is the default port that Postgres listens on.

-The `username` is _your_ username for your machine. The `password` may not be needed, or may be the password used when setting up Postgres initially.
+The `username` is _your_ username for your machine. The `password` may not be needed (just an empty string, `'password': ''`), or may be the password used when setting up Postgres initially.

 The `database` is the name of the database we created earlier, `postgres=# create database scrape;`.

@@ -108,12 +110,14 @@ Last item I want to point out before we move on is the usage of the double astri
 So first, our dictionary looks like:

 ```python
-DATABASE = {'drivername': 'postgres',
-            'host': 'localhost',
-            'port': '5432',
-            'username': 'lynnroot',
-            'password': 'root',
-            'database': 'scrape'}
+DATABASE = {
+    'drivername': 'postgres',
+    'host': 'localhost',
+    'port': '5432',
+    'username': 'lynn',
+    'password': '',
+    'database': 'scrape'
+}
 ```

 Then, the `URL()` function will parse out the elements, and create the following URL for the `create_engine()` function to read:

@@ -146,7 +150,7 @@
 Last, we define our actual table by inheriting from `DeclarativeBase` and setting up how we want to define each field we want to collect. We also have to import a few more things from SQLAlchemy:

 ```python
-from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy import create_engine, Column, Integer, String, DateTime

 # <--snip-->

@@ -156,12 +160,11 @@ class Deals(DeclarativeBase):

     id = Column(Integer, primary_key=True)
     title = Column('title', String)
-    description = Column('description', String, nullable=True)
     link = Column('link', String, nullable=True)
     location = Column('location', String, nullable=True)
-    category = Column('category', String, nullable=True)
     original_price = Column('original_price', String, nullable=True)
     price = Column('price', String, nullable=True)
+    end_date = Column('end_date', DateTime, nullable=True)
 ```

 We give our class a table name, "deals", as well as 7 fields. Each field will be mapped to a column in our table, which is created through `create_deals_table()`.

@@ -171,7 +174,7 @@ For each field, we define the type of field that it is, `Integer` for our primar
 All together:

 ```python
-from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy import create_engine, Column, Integer, String, DateTime
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.engine.url import URL

@@ -200,12 +203,11 @@ class Deals(DeclarativeBase):

     id = Column(Integer, primary_key=True)
     title = Column('title', String)
-    description = Column('description', String, nullable=True)
     link = Column('link', String, nullable=True)
     location = Column('location', String, nullable=True)
-    category = Column('category', String, nullable=True)
-    original_price = Column('original_price', String, nullable=True)
-    price = Column('price', String, nullable=True)
+    original_price = Column('original_price', Integer, nullable=True)
+    price = Column('price', Integer, nullable=True)
+    end_date = Column('end_date', DateTime, nullable=True)
 ```

 Let’s wrap up with how we pipeline our scraped data to save to our database.
````
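The double-asterisk unpacking that `URL(**DATABASE)` relies on is plain Python: `f(**d)` calls `f` with each key of `d` as a named argument. A tiny standalone illustration with a hypothetical `make_url()` helper (not part of the project):

```python
def make_url(drivername, username, password, host, port, database):
    # Each dictionary key arrives here as a keyword argument.
    return '{0}://{1}:{2}@{3}:{4}/{5}'.format(
        drivername, username, password, host, port, database)

DATABASE = {'drivername': 'postgres', 'host': 'localhost', 'port': '5432',
            'username': 'lynn', 'password': '', 'database': 'scrape'}

# make_url(**DATABASE) == make_url(drivername='postgres', host='localhost', ...)
print(make_url(**DATABASE))  # postgres://lynn:@localhost:5432/scrape
```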

website/_containers/scrape/2013-03-05-part-2.md (+8, -7)

````diff
@@ -27,13 +27,14 @@ class LivingSocialSpider(BaseSpider):
     start_urls = ["http://www.livingsocial.com/cities/15-san-francisco"]

     deals_list_xpath = '//li[@dealid]'
-    item_fields = {'title': './/a/div[@class="deal-bottom"]/h3[@itemprop]/text()',
-                   'link': './/a/@href',
-                   'description': './/a/div[@class="deal-bottom"]/p/text()',
-                   'category': './/a/div[@class="deal-top"]/div[@class="deal-category"]/span/text()',
-                   'location': './/a/div[@class="deal-top"]/ul[@class="unstyled deal-info"]/li/text()',
-                   'original_price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-original"]/del/text()',
-                   'price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-price"]/text()'}
+    item_fields = {
+        'title': './/span[@itemscope]/meta[@itemprop="name"]/@content',
+        'link': './/a/@href',
+        'location': './/a/div[@class="deal-details"]/p[@class="location"]/text()',
+        'original_price': './/a/div[@class="deal-prices"]/div[@class="deal-strikethrough-price"]/div[@class="strikethrough-wrapper"]/text()',
+        'price': './/a/div[@class="deal-prices"]/div[@class="deal-price"]/text()',
+        'end_date': './/span[@itemscope]/meta[@itemprop="availabilityEnds"]/@content'
+    }
 ```

 I’ve chosen to not build out the scaffolding with comments, but to throw this at you instead. Let’s walk it through.
````

website/_containers/scrape/2013-03-06-part-1.md (+5, -6)

````diff
@@ -50,12 +50,11 @@ Let’s add some items that we actually want to collect. We assign them to `Fiel
 class LivingSocialDeal(Item):
     """Livingsocial container (dictionary-like object) for scraped data"""
     title = Field()
-    description = Field()
     link = Field()
-    category = Field()
     location = Field()
     original_price = Field()
     price = Field()
+    end_date = Field()
 ```

 Nothing too hard - that was it. In scrapy, there are no other field types, unlike Django. So, we’re sort of stuck with `Field()`.
@@ -65,15 +64,15 @@ Let’s play around with this in the Python terminal. Make sure your `ScrapeProj
 ```bash
 >>> from scrapy.item import Item, Field
 >>> from items import LivingSocialDeal
->>> deal = LivingSocialDeal(title="$20 off yoga classes", category="health")
+>>> deal = LivingSocialDeal(title="$20 off yoga classes", price="50")
 >>> print deal
-LivingSocialDeal(title='$20 off yoga classes', category='health')
+LivingSocialDeal(title='$20 off yoga classes', price='50')
 >>> deal['title']
 '$20 off yoga classes'
 >>> deal.get('title')
 '$20 off yoga classes'
->>> deal['category']
-'health'
+>>> deal['price']
+'50'
 >>> deal['location'] = "New York"
 >>> deal['location']
 'New York'
````
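One more dict-like behavior worth noting while you’re in the terminal: a Scrapy `Item` only accepts the fields it declares, so the fields this commit removed now get rejected. A hedged sketch of the expected session (the exact error text may vary by Scrapy version):

```python
>>> deal = LivingSocialDeal(title="$20 off yoga classes")
>>> deal['category'] = "health"   # field removed in this commit
Traceback (most recent call last):
  ...
KeyError: 'LivingSocialDeal does not support field: category'
```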
