Commit 0ab1ae3

Update scraper requirements version, xpath, tutorial

1 parent: 27f7128

9 files changed (+77, -77 lines)

scrape/living_social/scraper_app/items.py (+1, -2)

```diff
@@ -15,9 +15,8 @@
 class LivingSocialDeal(Item):
     """Livingsocial container (dictionary-like object) for scraped data"""
     title = Field()
-    description = Field()
     link = Field()
-    category = Field()
     location = Field()
     original_price = Field()
     price = Field()
+    end_date = Field()
```
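A quick way to sanity-check the slimmed-down item is to build one by hand. This is a hypothetical smoke test, assuming you run it from the project directory so `scraper_app` is importable; the field values are made up:

```python
# Hypothetical smoke test for the updated LivingSocialDeal item.
from scraper_app.items import LivingSocialDeal

deal = LivingSocialDeal(title="Spa Packages with Jacuzzi and Pool Access",
                        price="80",
                        end_date="2015-09-17T11:59:59")

# Items behave like dictionaries for the fields they declare.
print(deal['title'])         # Spa Packages with Jacuzzi and Pool Access
print(deal.get('end_date'))  # 2015-09-17T11:59:59
```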

scrape/living_social/scraper_app/models.py (+6, -5)

```diff
@@ -10,7 +10,7 @@
 Direct run will create the table.
 """

-from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy import create_engine, Column, Integer, String, DateTime
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.engine.url import URL

@@ -28,19 +28,20 @@ def db_connect():
     """
     return create_engine(URL(**settings.DATABASE))

+
 def create_deals_table(engine):
     """"""
     DeclarativeBase.metadata.create_all(engine)

+
 class Deals(DeclarativeBase):
     """Sqlalchemy deals model"""
     __tablename__ = "deals"

     id = Column(Integer, primary_key=True)
     title = Column('title', String)
-    description = Column('description', String, nullable=True)
     link = Column('link', String, nullable=True)
     location = Column('location', String, nullable=True)
-    category = Column('category', String, nullable=True)
-    original_price = Column('original_price', String, nullable=True)
-    price = Column('price', String, nullable=True)
+    original_price = Column('original_price', Integer, nullable=True)
+    price = Column('price', Integer, nullable=True)
+    end_date = Column('end_date', DateTime, nullable=True)
```
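Wiring these pieces together works the same as before the change; here is a minimal sketch of creating the table and inserting one row with the new column types (assuming `settings.DATABASE` is filled in and Postgres is running — the row values are made up):

```python
from datetime import datetime

from sqlalchemy.orm import sessionmaker

from scraper_app.models import db_connect, create_deals_table, Deals

engine = db_connect()        # create_engine(URL(**settings.DATABASE))
create_deals_table(engine)   # creates "deals" if it does not already exist

session = sessionmaker(bind=engine)()
# price/original_price are now Integer, end_date is a DateTime.
session.add(Deals(title="Spa Packages with Jacuzzi and Pool Access",
                  original_price=120, price=80,
                  end_date=datetime(2015, 9, 17, 11, 59, 59)))
session.commit()
```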

scrape/living_social/scraper_app/settings.py (+8, -6)

```diff
@@ -12,9 +12,11 @@

 ITEM_PIPELINES = ['scraper_app.pipelines.LivingSocialPipeline']

-DATABASE = {'drivername': 'postgres',
-            'host': 'localhost',
-            'port': '5432',
-            'username': 'YOUR_USERNAME',  # fill in your username here
-            'password': 'YOUR_PASSWORD',  # fill in your password here
-            'database': 'scrape'}
+DATABASE = {
+    'drivername': 'postgres',
+    'host': 'localhost',
+    'port': '5432',
+    'username': 'YOUR_USERNAME',  # fill in your username here
+    'password': 'YOUR_PASSWORD',  # fill in your password here
+    'database': 'scrape'
+}
```
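For reference, `models.py` hands this dictionary to SQLAlchemy's `URL` class via `URL(**DATABASE)`; with the SQLAlchemy 0.9.x pinned in this commit, stringifying the result shows the connection URL it builds (a sketch with the placeholder credentials, of course):

```python
from sqlalchemy.engine.url import URL

DATABASE = {
    'drivername': 'postgres',
    'host': 'localhost',
    'port': '5432',
    'username': 'YOUR_USERNAME',
    'password': 'YOUR_PASSWORD',
    'database': 'scrape'
}

print(str(URL(**DATABASE)))
# postgres://YOUR_USERNAME:YOUR_PASSWORD@localhost:5432/scrape
```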

scrape/living_social/scraper_app/spiders/livingsocial_spider.py (+13, -10)

```diff
@@ -18,19 +18,22 @@


 class LivingSocialSpider(BaseSpider):
-    """Spider for regularly updated livingsocial.com site, San Francisco page"""
+    """
+    Spider for regularly updated livingsocial.com site, San Francisco page
+    """
     name = "livingsocial"
     allowed_domains = ["livingsocial.com"]
-    start_urls = ["http://www.livingsocial.com/cities/15-san-francisco"]
+    start_urls = ["https://www.livingsocial.com/cities/15-san-francisco"]

     deals_list_xpath = '//li[@dealid]'
-    item_fields = {'title': './/a/div[@class="deal-bottom"]/h3[@itemprop]/text()',
-                   'link': './/a/@href',
-                   'description': './/a/div[@class="deal-bottom"]/p/text()',
-                   'category': './/a/div[@class="deal-top"]/div[@class="deal-category"]/span/text()',
-                   'location': './/a/div[@class="deal-top"]/ul[@class="unstyled deal-info"]/li/text()',
-                   'original_price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-original"]/del/text()',
-                   'price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-price"]/text()'}
+    item_fields = {
+        'title': './/span[@itemscope]/meta[@itemprop="name"]/@content',
+        'link': './/a/@href',
+        'location': './/a/div[@class="deal-details"]/p[@class="location"]/text()',
+        'original_price': './/a/div[@class="deal-prices"]/div[@class="deal-strikethrough-price"]/div[@class="strikethrough-wrapper"]/text()',
+        'price': './/a/div[@class="deal-prices"]/div[@class="deal-price"]/text()',
+        'end_date': './/span[@itemscope]/meta[@itemprop="availabilityEnds"]/@content'
+    }

     def parse(self, response):
         """
@@ -45,7 +48,7 @@ def parse(self, response):
         selector = HtmlXPathSelector(response)

         # iterate over deals
-        for deal in selector.select(self.deals_list_xpath):
+        for deal in selector.xpath(self.deals_list_xpath):
             loader = XPathItemLoader(LivingSocialDeal(), selector=deal)

             # define processors
```
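Since the commit swaps `selector.select()` for `selector.xpath()` and rewrites every field XPath, it can be worth checking the new expressions outside a full crawl. A sketch using Scrapy's standalone `Selector` against a made-up HTML fragment shaped like what the new XPaths expect (the real livingsocial.com markup may differ):

```python
from scrapy.selector import Selector

# Made-up fragment mirroring the structure the new XPaths expect.
HTML = '''
<li dealid="1370546">
  <span itemscope>
    <meta itemprop="name" content="Thrilling Live-Action Escape Game for Six">
    <meta itemprop="availabilityEnds" content="2015-02-08T08:00:00">
  </span>
  <a href="https://www.livingsocial.com/deals/1370546">
    <div class="deal-details"><p class="location">San Francisco, CA</p></div>
    <div class="deal-prices"><div class="deal-price">$99</div></div>
  </a>
</li>
'''

for deal in Selector(text=HTML).xpath('//li[@dealid]'):
    title = deal.xpath('.//span[@itemscope]/meta[@itemprop="name"]/@content').extract()
    price = deal.xpath('.//a/div[@class="deal-prices"]/div[@class="deal-price"]/text()').extract()
    print(title, price)  # ([u'Thrilling Live-Action Escape Game for Six'], [u'$99'])
```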

scrape/requirements.txt (+3, -9)

```diff
@@ -1,9 +1,3 @@
-SQLAlchemy==0.8.0b2
-Scrapy==0.16.4
-Twisted==12.3.0
-lxml==3.1beta1
-psycopg2==2.4.6
-pyOpenSSL==0.13
-requests==1.1.0
-w3lib==1.2
-zope.interface==4.0.3
+SQLAlchemy==0.9.8
+Scrapy==0.24.4
+psycopg2==2.5.4
```
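Note that most of the dropped pins (Twisted, lxml, w3lib, pyOpenSSL, and through Twisted, zope.interface) are not gone from the environment: Scrapy declares them as its own dependencies, so `pip install -r requirements.txt` still pulls in compatible versions transitively, which is presumably why only the packages the project uses directly stay pinned.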

website/_containers/scrape/2013-03-02-part-5.md (+10, -11)

````diff
@@ -56,24 +56,23 @@ lynnroot=# \connect scrape
 psql (9.1.4, server 9.1.3)
 You are now connected to database "scrape" as user "lynnroot".
 scrape=# select * from deals limit 5;
- id | title | description | link | location | category | original_price | price
-----+---------------------------------------+------------------------------------+----------------------------------------------------------------------------+--------------+------------+----------------+-------
-  1 | Mini Box | Deck of Photo Playing Cards | /cities/1719-newyork-citywide/deals/614972-deck-of-photo-playing-cards | national | | 29 | 9
-  2 | Paintball: LivingSocial Original: | Paintball + BBQ Day Trip | /cities/1719-newyork-citywide/deals/575448-paintball-bbq-day-trip | NYC Citywide | activities | | 69
-  3 | Medieval Times | Medieval Times: Meal + Show Ticket | /cities/1719-newyork-citywide/deals/627242-medieval-times-meal-show-ticket | NYC Citywide | activities | 41 | 27
-  4 | '80s Boat Cruise: LivingSocial Ori... | NYC Boat Cruise + '80s Concert | /cities/1719-newyork-citywide/deals/610320-nyc-boat-cruise-80s-concert | NYC Citywide | activities | | 29
-  5 | New York Magazine | 50 Issues of New York Magazine | /cities/1719-newyork-citywide/deals/594056-50-issues-of-new-york-magazine | NYC Citywide | | 30 | 15
+ id | title | link | location | original_price | price | end_date
+----+---------------------------------------------------+--------------------------------------------------------------------------------------------------+-------------------+----------------+-------+---------------------
+  1 | Three-Course Prix-Fixe Contemporary American Meal | https://www.livingsocial.com/deals/1365378-three-course-prix-fixe-contemporary-american-meal | Calistoga, CA | 132 | 66 | 2015-12-24 08:00:00
+  2 | Thrilling Live-Action Escape Game for Six | https://www.livingsocial.com/deals/1370546-thrilling-live-action-escape-game-for-six | San Francisco, CA | 169 | 99 | 2015-02-08 08:00:00
+  3 | $30 to Spend on Food and Drink | https://www.livingsocial.com/cities/15-san-francisco/deals/1234440-30-to-spend-on-food-and-drink | San Francisco, CA | 30 | 15 | 2015-07-29 11:59:00
+  4 | $50 Toward Asian-Inspired Seafood, Steaks & More | https://www.livingsocial.com/deals/1333444-50-toward-asian-inspired-seafood-steaks-more | San Francisco, CA | 50 | 25 | 2015-02-19 12:59:00
+  5 | Spa Packages with Jacuzzi and Pool Access | https://www.livingsocial.com/deals/1278806-spa-packages-with-jacuzzi-and-pool-access | San Francisco, CA | 120 | 80 | 2015-09-17 11:59:59
 (5 rows)
 ```

 Try a few of these select queries:

 ```psql
-scrape=# select * from deals where title like ('%Yoga');
-scrape=# select * from deals where description like ('%Yoga%');
-scrape=# select * from deals where description like ('%Dinner');
-scrape=# select link from deals where description like ('%Photography%');
 scrape=# select title from deals limit 30;
+scrape=# select link from deals where price < 50;
+scrape=# select title from deals where end_date < '2015-02-08';
+scrape=# select * from deals where title like ('%Yoga');
 ```

 Notice how the string we’re searching for contains `%` – this is a wildcard character. The last query essentially says "find deals whose title ends with 'Yoga'" (a `%` on both sides would match 'Yoga' anywhere in the title). Learn more about [querying Postgres](http://www.postgresql.org/docs/8.4/static/tutorial-select.html).
````
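Since part 3 of the tutorial maps this table with SQLAlchemy, the same queries can also be issued from Python. A sketch, assuming the `db_connect()` and `Deals` definitions from `models.py` and a database the spider has already populated:

```python
from sqlalchemy.orm import sessionmaker

from scraper_app.models import db_connect, Deals

session = sessionmaker(bind=db_connect())()

# Equivalent of: select * from deals where title like ('%Yoga');
for deal in session.query(Deals).filter(Deals.title.like('%Yoga')):
    print(deal.title)

# The new Integer/DateTime columns make comparisons natural:
cheap = session.query(Deals).filter(Deals.price < 50).all()
```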

website/_containers/scrape/2013-03-04-part-3.md (+23, -21)

````diff
@@ -61,17 +61,19 @@ SPIDER_MODULES = ['scraper_app.spiders']
 We then define our database through a dictionary:

 ```python
-DATABASE = {'drivername': 'postgres',
-            'host': 'localhost',
-            'port': '5432',
-            'username': 'lynnroot',
-            'password': 'root',
-            'database': 'scrape'}
+DATABASE = {
+    'drivername': 'postgres',
+    'host': 'localhost',
+    'port': '5432',
+    'username': 'YOUR_USERNAME',
+    'password': 'YOUR_PASSWORD',
+    'database': 'scrape'
+}
 ```

 The `drivername` is the type of database we're using – Postgres. Since we're using a Postgres installed on our own computer, the location, or the `host`, is `localhost`. The port is the default port that Postgres listens on.

-The `username` is _your_ username for your machine. The `password` may not be needed, or may be the password used when setting up Postgres initially.
+The `username` is _your_ username for your machine. The `password` may not be needed (just an empty string, `'password': ''`), or may be the password used when setting up Postgres initially.

 The `database` is the name of the database we created earlier, `postgres=# create database scrape;`.

@@ -108,12 +110,14 @@ Last item I want to point out before we move on is the usage of the double astri
 So first, our dictionary looks like:

 ```python
-DATABASE = {'drivername': 'postgres',
-            'host': 'localhost',
-            'port': '5432',
-            'username': 'lynnroot',
-            'password': 'root',
-            'database': 'scrape'}
+DATABASE = {
+    'drivername': 'postgres',
+    'host': 'localhost',
+    'port': '5432',
+    'username': 'lynn',
+    'password': '',
+    'database': 'scrape'
+}
 ```

 Then, the `URL()` function will parse out the elements, and create the following URL for the `create_engine()` function to read:

@@ -146,7 +150,7 @@
 Last, we define our actual table by inheriting from `DeclarativeBase` and setting up how we want to define each field we want to collect. We also have to import a few more things from SQLAlchemy:

 ```python
-from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy import create_engine, Column, Integer, String, DateTime

 # <--snip-->

@@ -156,12 +160,11 @@ class Deals(DeclarativeBase):

     id = Column(Integer, primary_key=True)
     title = Column('title', String)
-    description = Column('description', String, nullable=True)
     link = Column('link', String, nullable=True)
     location = Column('location', String, nullable=True)
-    category = Column('category', String, nullable=True)
     original_price = Column('original_price', String, nullable=True)
     price = Column('price', String, nullable=True)
+    end_date = Column('end_date', DateTime, nullable=True)
 ```

 We give our class a table name, "deals", as well as 7 fields. Each field will be mapped to a column in our table, which is created through `create_deals_table()`.

@@ -171,7 +174,7 @@ For each field, we define the type of field that it is, `Integer` for our primar
 All together:

 ```python
-from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy import create_engine, Column, Integer, String, DateTime
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.engine.url import URL

@@ -200,12 +203,11 @@ class Deals(DeclarativeBase):

     id = Column(Integer, primary_key=True)
     title = Column('title', String)
-    description = Column('description', String, nullable=True)
     link = Column('link', String, nullable=True)
     location = Column('location', String, nullable=True)
-    category = Column('category', String, nullable=True)
-    original_price = Column('original_price', String, nullable=True)
-    price = Column('price', String, nullable=True)
+    original_price = Column('original_price', Integer, nullable=True)
+    price = Column('price', Integer, nullable=True)
+    end_date = Column('end_date', DateTime, nullable=True)
 ```

 Let’s wrap up with how we pipeline our scraped data to save to our database.
````
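The double-asterisk unpacking that `URL(**DATABASE)` relies on is plain Python: `f(**d)` calls `f` with each key of `d` as a named argument. A tiny standalone illustration with a hypothetical `make_url()` helper (not part of the project):

```python
def make_url(drivername, username, password, host, port, database):
    # Each dictionary key arrives here as a keyword argument.
    return '{0}://{1}:{2}@{3}:{4}/{5}'.format(
        drivername, username, password, host, port, database)

DATABASE = {'drivername': 'postgres', 'host': 'localhost', 'port': '5432',
            'username': 'lynn', 'password': '', 'database': 'scrape'}

# make_url(**DATABASE) == make_url(drivername='postgres', host='localhost', ...)
print(make_url(**DATABASE))  # postgres://lynn:@localhost:5432/scrape
```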

website/_containers/scrape/2013-03-05-part-2.md (+8, -7)

````diff
@@ -27,13 +27,14 @@ class LivingSocialSpider(BaseSpider):
     start_urls = ["http://www.livingsocial.com/cities/15-san-francisco"]

     deals_list_xpath = '//li[@dealid]'
-    item_fields = {'title': './/a/div[@class="deal-bottom"]/h3[@itemprop]/text()',
-                   'link': './/a/@href',
-                   'description': './/a/div[@class="deal-bottom"]/p/text()',
-                   'category': './/a/div[@class="deal-top"]/div[@class="deal-category"]/span/text()',
-                   'location': './/a/div[@class="deal-top"]/ul[@class="unstyled deal-info"]/li/text()',
-                   'original_price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-original"]/del/text()',
-                   'price': './/a/div[@class="deal-bottom"]/ul[@class="unstyled deal-info"]/li[@class="deal-price"]/text()'}
+    item_fields = {
+        'title': './/span[@itemscope]/meta[@itemprop="name"]/@content',
+        'link': './/a/@href',
+        'location': './/a/div[@class="deal-details"]/p[@class="location"]/text()',
+        'original_price': './/a/div[@class="deal-prices"]/div[@class="deal-strikethrough-price"]/div[@class="strikethrough-wrapper"]/text()',
+        'price': './/a/div[@class="deal-prices"]/div[@class="deal-price"]/text()',
+        'end_date': './/span[@itemscope]/meta[@itemprop="availabilityEnds"]/@content'
+    }
 ```

 I’ve chosen to not build out the scaffolding with comments, but to throw this at you instead. Let’s walk it through.
````

website/_containers/scrape/2013-03-06-part-1.md (+5, -6)

````diff
@@ -50,12 +50,11 @@ Let’s add some items that we actually want to collect. We assign them to `Fiel
 class LivingSocialDeal(Item):
     """Livingsocial container (dictionary-like object) for scraped data"""
     title = Field()
-    description = Field()
     link = Field()
-    category = Field()
     location = Field()
     original_price = Field()
     price = Field()
+    end_date = Field()
 ```

 Nothing too hard - that was it. In scrapy, there are no other field types, unlike Django. So, we’re sort of stuck with `Field()`.
@@ -65,15 +64,15 @@ Let’s play around with this in the Python terminal. Make sure your `ScrapeProj
 ```bash
 >>> from scrapy.item import Item, Field
 >>> from items import LivingSocialDeal
->>> deal = LivingSocialDeal(title="$20 off yoga classes", category="health")
+>>> deal = LivingSocialDeal(title="$20 off yoga classes", price="50")
 >>> print deal
-LivingSocialDeal(title='$20 off yoga classes', category='health')
+LivingSocialDeal(title='$20 off yoga classes', price='50')
 >>> deal['title']
 '$20 off yoga classes'
 >>> deal.get('title')
 '$20 off yoga classes'
->>> deal['category']
-'health'
+>>> deal['price']
+'50'
 >>> deal['location'] = "New York"
 >>> deal['location']
 'New York'
````
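One more dict-like behavior worth noting while you’re in the terminal: a Scrapy `Item` only accepts the fields it declares, so the fields this commit removed now get rejected. A hedged sketch of the expected session (the exact error text may vary by Scrapy version):

```python
>>> deal = LivingSocialDeal(title="$20 off yoga classes")
>>> deal['category'] = "health"   # field removed in this commit
Traceback (most recent call last):
  ...
KeyError: 'LivingSocialDeal does not support field: category'
```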
