From 8b26e928f18ce8031da3c83b01d00344f96edd15 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 25 May 2019 13:12:28 +0300 Subject: [PATCH 01/11] add django on_delete keyword for foreign keys since it's required in new versions (django2) --- dynamic_scraper/models.py | 77 ++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/dynamic_scraper/models.py b/dynamic_scraper/models.py index 21790da6..88034e29 100644 --- a/dynamic_scraper/models.py +++ b/dynamic_scraper/models.py @@ -25,7 +25,7 @@ class ScrapedObjClass(models.Model): "ZERO_ACTIONS_FACTOR_CHANGE": 5,\n\ "FACTOR_CHANGE_FACTOR": 1.3,\n') comments = models.TextField(blank=True) - + def __str__(self): return self.name @@ -46,14 +46,14 @@ class ScrapedObjAttr(models.Model): ) name = models.CharField(max_length=200) order = models.IntegerField(default=100) - obj_class = models.ForeignKey(ScrapedObjClass) + obj_class = models.ForeignKey(ScrapedObjClass, on_delete=models.CASCADE) attr_type = models.CharField(max_length=1, choices=ATTR_TYPE_CHOICES) id_field = models.BooleanField(default=False) save_to_db = models.BooleanField(default=True) - + def __str__(self): return self.name + " (" + str(self.obj_class) + ")" - + class Meta(object): ordering = ['order',] @@ -100,7 +100,7 @@ class Scraper(models.Model): ('O', 'FOLLOW'), ) name = models.CharField(max_length=200) - scraped_obj_class = models.ForeignKey(ScrapedObjClass) + scraped_obj_class = models.ForeignKey(ScrapedObjClass, models.SET_NULL) help_text = "Runtime status of the scraper, used by scheduling mechanism." status = models.CharField(max_length=1, choices=STATUS_CHOICES, default='P', help_text=help_text) help_text = "Internal work/progress status of the scraper." @@ -112,7 +112,7 @@ class Scraper(models.Model): pagination_type = models.CharField(max_length=1, choices=PAGINATION_TYPE, default='N') pagination_on_start = models.BooleanField(default=False) pagination_append_str = models.CharField(max_length=200, blank=True, help_text="Syntax: /somepartofurl/{page}/moreurlstuff.html") - pagination_page_replace = models.TextField(blank=True, + pagination_page_replace = models.TextField(blank=True, help_text="RANGE_FUNCT: uses Python range funct., syntax: [start], stop[, step], FREE_LIST: 'Replace text 1', 'Some other text 2', 'Maybe a number 3', ...") help_text = "Optional, follow links from a single non-paginated or all statically paginated (RANGE_FUNCT, FREE_LIST) main pages" follow_pages_url_xpath = models.TextField(blank=True, help_text=help_text) @@ -120,12 +120,12 @@ class Scraper(models.Model): follow_pages_page_xpath = models.TextField(blank=True, help_text=help_text) help_text = "Optionally limit number of pages to follow (default: follow until XPath fails)" num_pages_follow = models.IntegerField(blank=True, null=True, help_text=help_text) - last_scraper_save_alert_period = models.CharField(max_length=5, blank=True, + last_scraper_save_alert_period = models.CharField(max_length=5, blank=True, help_text="Optional, used for scraper monitoring with 'check_last_scraper_saves' management cmd, \ syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')") next_last_scraper_save_alert = models.DateTimeField(default=datetime.datetime.now, help_text="Next time the last scraper save will be alerted, normally set on management cmd run.",) - last_checker_delete_alert_period = models.CharField(max_length=5, blank=True, + last_checker_delete_alert_period = models.CharField(max_length=5, blank=True, help_text="Optional, used for scraper monitoring with 'check_last_checker_deletes' management cmd, \ syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')") next_last_checker_delete_alert = models.DateTimeField(default=datetime.datetime.now, @@ -133,7 +133,7 @@ class Scraper(models.Model): comments = models.TextField(blank=True) last_scraper_save = models.DateTimeField(null=True, blank=True) last_checker_delete = models.DateTimeField(null=True, blank=True) - + def get_alert_period_timedelta(self, attribute_str): if getattr(self, attribute_str) and len(getattr(self, attribute_str)) >= 2: period_str = getattr(self, attribute_str)[-1] @@ -153,16 +153,16 @@ def get_alert_period_timedelta(self, attribute_str): return None else: return None - + def get_last_scraper_save_alert_period_timedelta(self): return self.get_alert_period_timedelta('last_scraper_save_alert_period') - + def get_last_checker_delete_alert_period_timedelta(self): return self.get_alert_period_timedelta('last_checker_delete_alert_period') - + def get_main_page_rpt(self): return self.requestpagetype_set.get(page_type='MP') - + def get_follow_page_rpts(self): return self.requestpagetype_set.filter(page_type='FP') @@ -177,16 +177,16 @@ def get_rpt_for_scraped_obj_attr(self, soa): def get_base_elems(self): return self.scraperelem_set.filter(scraped_obj_attr__attr_type='B') - + def get_base_elem(self): return self.scraperelem_set.get(scraped_obj_attr__attr_type='B') - + def get_detail_page_url_elems(self): return self.scraperelem_set.filter(scraped_obj_attr__attr_type='U') def get_detail_page_url_id_elems(self): return self.scraperelem_set.filter(scraped_obj_attr__attr_type='U', scraped_obj_attr__id_field=True) - + def get_standard_elems(self): q1 = Q(scraped_obj_attr__attr_type='S') q2 = Q(scraped_obj_attr__attr_type='T') @@ -204,33 +204,33 @@ def get_standard_update_elems(self): def get_standard_update_elems_from_detail_pages(self): return self.scraperelem_set.filter(scraped_obj_attr__attr_type='T').filter(~Q(request_page_type='MP')) - + def get_image_elems(self): return self.scraperelem_set.filter(scraped_obj_attr__attr_type='I') - + def get_image_elem(self): return self.scraperelem_set.get(scraped_obj_attr__attr_type='I') - + def get_scrape_elems(self): q1 = Q(scraped_obj_attr__attr_type='S') q2 = Q(scraped_obj_attr__attr_type='T') q3 = Q(scraped_obj_attr__attr_type='U') q4 = Q(scraped_obj_attr__attr_type='I') return self.scraperelem_set.filter(q1 | q2 | q3 | q4) - + def get_mandatory_scrape_elems(self): q1 = Q(scraped_obj_attr__attr_type='S') q2 = Q(scraped_obj_attr__attr_type='T') q3 = Q(scraped_obj_attr__attr_type='U') q4 = Q(scraped_obj_attr__attr_type='I') return self.scraperelem_set.filter(q1 | q2 | q3 | q4).filter(mandatory=True) - + def get_from_detail_pages_scrape_elems(self): return self.scraperelem_set.filter(~Q(request_page_type='MP')) - + def __str__(self): return self.name + " (" + self.scraped_obj_class.name + ")" - + class Meta(object): ordering = ['name', 'scraped_obj_class',] @@ -253,8 +253,8 @@ class RequestPageType(models.Model): ) help_text = "One main page RPT, an optional follow page RPT (if follow pagination is used) and detail page RPTs for all DETAIL_PAGE_URLs" page_type = models.CharField(max_length=3, choices=TYPE_CHOICES, help_text=help_text) - scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, blank=True, null=True, help_text="Empty for main page, attribute of type DETAIL_PAGE_URL scraped from main page for detail pages.") - scraper = models.ForeignKey(Scraper) + scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, blank=True, null=True, help_text="Empty for main page, attribute of type DETAIL_PAGE_URL scraped from main page for detail pages.",on_delete=models.SET_NULL) + scraper = models.ForeignKey(Scraper, on_delete=models.CASCADE) content_type = models.CharField(max_length=1, choices=CONTENT_TYPE_CHOICES, default='H', help_text="Data type format for scraped pages of page type (for JSON use JSONPath instead of XPath)") render_javascript = models.BooleanField(default=False, help_text="Render Javascript on pages (ScrapyJS/Splash deployment needed, careful: resource intense)") request_type = models.CharField(max_length=1, choices=REQUEST_TYPE_CHOICES, default='R', help_text="Normal (typically GET) request (default) or form request (typically POST), using Scrapys corresponding request classes (not used for checker).") @@ -280,24 +280,25 @@ class Checker(models.Model): ('4', '404'), ('X', '404_OR_X_PATH'), ) - scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, help_text="Attribute of type DETAIL_PAGE_URL, several checkers for same DETAIL_PAGE_URL attribute possible.") - scraper = models.ForeignKey(Scraper) + scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, help_text="Attribute of type DETAIL_PAGE_URL, several checkers for same DETAIL_PAGE_URL attribute possible.", + on_delete=models.CASCADE) + scraper = models.ForeignKey(Scraper, on_delete=models.CASCADE) checker_type = models.CharField(max_length=1, choices=CHECKER_TYPE, default='4') checker_x_path = models.TextField(blank=True) checker_x_path_result = models.TextField(blank=True) checker_ref_url = models.URLField(max_length=500, blank=True) comments = models.TextField(blank=True) - + def __str__(self): return str(self.scraped_obj_attr) + ' > ' + self.get_checker_type_display() - + @python_2_unicode_compatible class ScraperElem(models.Model): REQUEST_PAGE_TYPE_CHOICES = tuple([("MP", "Main Page")] + [("DP{n}".format(n=str(n)), "Detail Page {n}".format(n=str(n))) for n in list(range(1, 26))]) help_text = "The different attributes to be scraped, exactly one attribute of type BASE necessary." - scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, help_text=help_text) - scraper = models.ForeignKey(Scraper) + scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, help_text=help_text, on_delete=models.CASCADE) + scraper = models.ForeignKey(Scraper, on_delete=models.CASCADE) x_path = models.TextField(blank=True, help_text='XPath or JSONPath expression, leave blank on "static" processor use.') reg_exp = models.TextField(blank=True, help_text="Optional filtering by regular expression (e.g. 'Scrape only (.*) the text in between').") help_text = "Corresponding Request Page Types created for this scraper." @@ -310,16 +311,16 @@ class ScraperElem(models.Model): proc_ctxt = models.TextField(blank=True, help_text=help_text) help_text = "Drop item if attribute could not be scraped." mandatory = models.BooleanField(default=True, help_text=help_text) - + def __str__(self): return '{s} > {soa} Attribute ({rpt})'.format( s=str(self.scraper), soa=self.scraped_obj_attr.name, rpt=self.get_request_page_type_display()) - + class Meta(object): ordering = ['scraped_obj_attr__order',] - + @python_2_unicode_compatible @@ -332,10 +333,10 @@ class SchedulerRuntime(models.Model): next_action_time = models.DateTimeField(default=datetime.datetime.now) next_action_factor = models.FloatField(blank=True, null=True) num_zero_actions = models.IntegerField(default=0) - + def __str__(self): return str(self.id) - + class Meta(object): ordering = ['next_action_time',] @@ -356,7 +357,7 @@ class LogMarker(models.Model): mark_with_type = models.CharField(max_length=2, choices=TYPE_CHOICES, help_text=help_text) custom_type = models.CharField(max_length=25, blank=True) spider_name = models.CharField(max_length=200, blank=True) - scraper = models.ForeignKey(Scraper, blank=True, null=True) + scraper = models.ForeignKey(Scraper, blank=True, null=True, on_delete=models.SET_NULL) class Log(models.Model): @@ -372,7 +373,7 @@ class Log(models.Model): type = models.CharField(max_length=25, blank=True) level = models.IntegerField(choices=LEVEL_CHOICES) spider_name = models.CharField(max_length=200) - scraper = models.ForeignKey(Scraper, blank=True, null=True) + scraper = models.ForeignKey(Scraper, blank=True, null=True, on_delete=models.SET_NULL) date = models.DateTimeField(default=datetime.datetime.now) @staticmethod From 48eb4c047d511f8272a2a459b0d39a2c97b8d125 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 25 May 2019 13:38:28 +0300 Subject: [PATCH 02/11] add django on_delete keyword for foreign keys since it's required in new versions (django2) --- dynamic_scraper/models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dynamic_scraper/models.py b/dynamic_scraper/models.py index 88034e29..2bbfb4be 100644 --- a/dynamic_scraper/models.py +++ b/dynamic_scraper/models.py @@ -100,7 +100,7 @@ class Scraper(models.Model): ('O', 'FOLLOW'), ) name = models.CharField(max_length=200) - scraped_obj_class = models.ForeignKey(ScrapedObjClass, models.SET_NULL) + scraped_obj_class = models.ForeignKey(ScrapedObjClass, models.CASCADE) help_text = "Runtime status of the scraper, used by scheduling mechanism." status = models.CharField(max_length=1, choices=STATUS_CHOICES, default='P', help_text=help_text) help_text = "Internal work/progress status of the scraper." @@ -253,7 +253,7 @@ class RequestPageType(models.Model): ) help_text = "One main page RPT, an optional follow page RPT (if follow pagination is used) and detail page RPTs for all DETAIL_PAGE_URLs" page_type = models.CharField(max_length=3, choices=TYPE_CHOICES, help_text=help_text) - scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, blank=True, null=True, help_text="Empty for main page, attribute of type DETAIL_PAGE_URL scraped from main page for detail pages.",on_delete=models.SET_NULL) + scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, blank=True, null=True, help_text="Empty for main page, attribute of type DETAIL_PAGE_URL scraped from main page for detail pages.",on_delete=models.CASCADE) scraper = models.ForeignKey(Scraper, on_delete=models.CASCADE) content_type = models.CharField(max_length=1, choices=CONTENT_TYPE_CHOICES, default='H', help_text="Data type format for scraped pages of page type (for JSON use JSONPath instead of XPath)") render_javascript = models.BooleanField(default=False, help_text="Render Javascript on pages (ScrapyJS/Splash deployment needed, careful: resource intense)") @@ -357,7 +357,7 @@ class LogMarker(models.Model): mark_with_type = models.CharField(max_length=2, choices=TYPE_CHOICES, help_text=help_text) custom_type = models.CharField(max_length=25, blank=True) spider_name = models.CharField(max_length=200, blank=True) - scraper = models.ForeignKey(Scraper, blank=True, null=True, on_delete=models.SET_NULL) + scraper = models.ForeignKey(Scraper, blank=True, null=True, on_delete=models.CASCADE) class Log(models.Model): @@ -373,7 +373,7 @@ class Log(models.Model): type = models.CharField(max_length=25, blank=True) level = models.IntegerField(choices=LEVEL_CHOICES) spider_name = models.CharField(max_length=200) - scraper = models.ForeignKey(Scraper, blank=True, null=True, on_delete=models.SET_NULL) + scraper = models.ForeignKey(Scraper, blank=True, null=True, on_delete=models.CASCADE) date = models.DateTimeField(default=datetime.datetime.now) @staticmethod From 9c3ee8a4dd9baf30c4747841aa6bef44f60cc0b8 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 25 May 2019 14:12:49 +0300 Subject: [PATCH 03/11] change migration scripts to be generated by django2 --- dynamic_scraper/migrations/0001_initial.py | 163 ++++++++++------ .../0002_scraper_render_javascript.py | 19 -- .../migrations/0003_auto_20150610_0906.py | 24 --- .../0004_scrapedobjattr_id_field.py | 27 --- .../0005_new_dict_params_for_scraper.py | 39 ---- .../migrations/0006_request_type_and_body.py | 49 ----- .../migrations/0007_dont_filter_attribute.py | 19 -- .../0008_new_request_page_types_construct.py | 73 -------- ...legacy_request_page_type_scraper_fields.py | 62 ------- ...010_move_save_to_db_to_scraped_obj_attr.py | 23 --- ...checker_attributes_to_own_checker_class.py | 56 ------ ...moved_legacy_checker_scraper_attributes.py | 30 --- ...save_and_checker_delete_datetime_fields.py | 28 --- ..._delete_alert_period_fields_for_scraper.py | 24 --- ...t_scraper_save_and_checker_delete_alert.py | 35 ---- ...ype_for_x_path_reg_exp_processor_fields.py | 44 ----- .../0017_added_order_to_scraped_obj_attr.py | 27 --- ...se_default_procs_field_to_scraper_elems.py | 56 ------ ...ated_help_texts_for_request_info_fields.py | 35 ---- ...0020_added_work_status_field_to_scraper.py | 20 -- .../0021_added_owner_field_to_scraper.py | 30 --- ...22_added_option_for_scraper_work_status.py | 20 -- ...pages_follow_atts_to_scraper_pagination.py | 30 --- ...024_new_follow_page_type_choice_for_rpt.py | 20 -- ...w_pages_page_xpath_pagination_attribute.py | 45 ----- .../south_migrations/0001_initial.py | 174 ------------------ ...02_auto__add_field_scraper_content_type.py | 88 --------- ...auto__add_logmarker__add_field_log_type.py | 115 ------------ ...auto__chg_field_scraper_checker_ref_url.py | 98 ---------- 29 files changed, 109 insertions(+), 1364 deletions(-) delete mode 100644 dynamic_scraper/migrations/0002_scraper_render_javascript.py delete mode 100644 dynamic_scraper/migrations/0003_auto_20150610_0906.py delete mode 100644 dynamic_scraper/migrations/0004_scrapedobjattr_id_field.py delete mode 100644 dynamic_scraper/migrations/0005_new_dict_params_for_scraper.py delete mode 100644 dynamic_scraper/migrations/0006_request_type_and_body.py delete mode 100644 dynamic_scraper/migrations/0007_dont_filter_attribute.py delete mode 100644 dynamic_scraper/migrations/0008_new_request_page_types_construct.py delete mode 100644 dynamic_scraper/migrations/0009_removed_legacy_request_page_type_scraper_fields.py delete mode 100644 dynamic_scraper/migrations/0010_move_save_to_db_to_scraped_obj_attr.py delete mode 100644 dynamic_scraper/migrations/0011_extracted_checker_attributes_to_own_checker_class.py delete mode 100644 dynamic_scraper/migrations/0012_removed_legacy_checker_scraper_attributes.py delete mode 100644 dynamic_scraper/migrations/0013_added_scraper_save_and_checker_delete_datetime_fields.py delete mode 100644 dynamic_scraper/migrations/0014_added_scraper_save_and_checker_delete_alert_period_fields_for_scraper.py delete mode 100644 dynamic_scraper/migrations/0015_added_datetime_fields_for_last_scraper_save_and_checker_delete_alert.py delete mode 100644 dynamic_scraper/migrations/0016_optional_xpath_fields_text_type_for_x_path_reg_exp_processor_fields.py delete mode 100644 dynamic_scraper/migrations/0017_added_order_to_scraped_obj_attr.py delete mode 100644 dynamic_scraper/migrations/0018_added_use_default_procs_field_to_scraper_elems.py delete mode 100644 dynamic_scraper/migrations/0019_updated_help_texts_for_request_info_fields.py delete mode 100644 dynamic_scraper/migrations/0020_added_work_status_field_to_scraper.py delete mode 100644 dynamic_scraper/migrations/0021_added_owner_field_to_scraper.py delete mode 100644 dynamic_scraper/migrations/0022_added_option_for_scraper_work_status.py delete mode 100644 dynamic_scraper/migrations/0023_added_follow_pages_by_xpath_and_num_pages_follow_atts_to_scraper_pagination.py delete mode 100644 dynamic_scraper/migrations/0024_new_follow_page_type_choice_for_rpt.py delete mode 100644 dynamic_scraper/migrations/0025_new_follow_pages_page_xpath_pagination_attribute.py delete mode 100644 dynamic_scraper/south_migrations/0001_initial.py delete mode 100644 dynamic_scraper/south_migrations/0002_auto__add_field_scraper_content_type.py delete mode 100644 dynamic_scraper/south_migrations/0003_auto__add_logmarker__add_field_log_type.py delete mode 100644 dynamic_scraper/south_migrations/0004_auto__chg_field_scraper_checker_ref_url.py diff --git a/dynamic_scraper/migrations/0001_initial.py b/dynamic_scraper/migrations/0001_initial.py index 4b88f025..fb87eec1 100644 --- a/dynamic_scraper/migrations/0001_initial.py +++ b/dynamic_scraper/migrations/0001_initial.py @@ -1,143 +1,198 @@ # -*- coding: utf-8 -*- +# Generated by Django 1.11.20 on 2019-05-25 06:05 from __future__ import unicode_literals -from django.db import models, migrations import datetime +from django.db import migrations, models +import django.db.models.deletion class Migration(migrations.Migration): + initial = True + dependencies = [ ] operations = [ + migrations.CreateModel( + name='Checker', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('checker_type', models.CharField(choices=[('4', '404'), ('X', '404_OR_X_PATH')], default='4', max_length=1)), + ('checker_x_path', models.TextField(blank=True)), + ('checker_x_path_result', models.TextField(blank=True)), + ('checker_ref_url', models.URLField(blank=True, max_length=500)), + ('comments', models.TextField(blank=True)), + ], + ), migrations.CreateModel( name='Log', fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('message', models.CharField(max_length=255)), ('ref_object', models.CharField(max_length=200)), - ('type', models.CharField(max_length=25, blank=True)), - ('level', models.IntegerField(choices=[(50, b'CRITICAL'), (40, b'ERROR'), (30, b'WARNING'), (20, b'INFO'), (10, b'DEBUG')])), + ('type', models.CharField(blank=True, max_length=25)), + ('level', models.IntegerField(choices=[(50, 'CRITICAL'), (40, 'ERROR'), (30, 'WARNING'), (20, 'INFO'), (10, 'DEBUG')])), ('spider_name', models.CharField(max_length=200)), ('date', models.DateTimeField(default=datetime.datetime.now)), ], options={ 'ordering': ['-date'], }, - bases=(models.Model,), ), migrations.CreateModel( name='LogMarker', fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('message_contains', models.CharField(max_length=255)), - ('ref_object', models.CharField(max_length=200, blank=True)), - ('mark_with_type', models.CharField(help_text=b'Choose "Custom" and enter your own type in the next field for a custom type', max_length=2, choices=[(b'PE', b'Planned Error'), (b'DD', b'Dirty Data'), (b'IM', b'Important'), (b'IG', b'Ignore'), (b'MI', b'Miscellaneous'), (b'CU', b'Custom')])), - ('custom_type', models.CharField(max_length=25, blank=True)), - ('spider_name', models.CharField(max_length=200, blank=True)), + ('ref_object', models.CharField(blank=True, max_length=200)), + ('mark_with_type', models.CharField(choices=[('PE', 'Planned Error'), ('DD', 'Dirty Data'), ('IM', 'Important'), ('IG', 'Ignore'), ('MI', 'Miscellaneous'), ('CU', 'Custom')], help_text='Choose "Custom" and enter your own type in the next field for a custom type', max_length=2)), + ('custom_type', models.CharField(blank=True, max_length=25)), + ('spider_name', models.CharField(blank=True, max_length=200)), + ], + ), + migrations.CreateModel( + name='RequestPageType', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('page_type', models.CharField(choices=[('MP', 'Main Page'), ('FP', 'Follow Page'), ('DP1', 'Detail Page 1'), ('DP2', 'Detail Page 2'), ('DP3', 'Detail Page 3'), ('DP4', 'Detail Page 4'), ('DP5', 'Detail Page 5'), ('DP6', 'Detail Page 6'), ('DP7', 'Detail Page 7'), ('DP8', 'Detail Page 8'), ('DP9', 'Detail Page 9'), ('DP10', 'Detail Page 10'), ('DP11', 'Detail Page 11'), ('DP12', 'Detail Page 12'), ('DP13', 'Detail Page 13'), ('DP14', 'Detail Page 14'), ('DP15', 'Detail Page 15'), ('DP16', 'Detail Page 16'), ('DP17', 'Detail Page 17'), ('DP18', 'Detail Page 18'), ('DP19', 'Detail Page 19'), ('DP20', 'Detail Page 20'), ('DP21', 'Detail Page 21'), ('DP22', 'Detail Page 22'), ('DP23', 'Detail Page 23'), ('DP24', 'Detail Page 24'), ('DP25', 'Detail Page 25')], help_text='One main page RPT, an optional follow page RPT (if follow pagination is used) and detail page RPTs for all DETAIL_PAGE_URLs', max_length=3)), + ('content_type', models.CharField(choices=[('H', 'HTML'), ('X', 'XML'), ('J', 'JSON')], default='H', help_text='Data type format for scraped pages of page type (for JSON use JSONPath instead of XPath)', max_length=1)), + ('render_javascript', models.BooleanField(default=False, help_text='Render Javascript on pages (ScrapyJS/Splash deployment needed, careful: resource intense)')), + ('request_type', models.CharField(choices=[('R', 'Request'), ('F', 'FormRequest')], default='R', help_text='Normal (typically GET) request (default) or form request (typically POST), using Scrapys corresponding request classes (not used for checker).', max_length=1)), + ('method', models.CharField(choices=[('GET', 'GET'), ('POST', 'POST')], default='GET', help_text='HTTP request via GET or POST.', max_length=10)), + ('headers', models.TextField(blank=True, help_text='Optional HTTP headers sent with each request, provided as a JSON dict (e.g. {"Referer":"http://referer_url"}, use double quotes!)), can use {main page attribute}, {page} and {follow_page} placeholders.')), + ('body', models.TextField(blank=True, help_text='Optional HTTP message body provided as a unicode string, can use {main page attribute}, {page} and {follow_page} placeholders.')), + ('cookies', models.TextField(blank=True, help_text='Optional cookies as JSON dict (use double quotes!), can use {main page attribute}, {page} and {follow_page} placeholders.')), + ('meta', models.TextField(blank=True, help_text='Optional Scrapy meta attributes as JSON dict (use double quotes!), see Scrapy docs for reference.')), + ('form_data', models.TextField(blank=True, help_text='Optional HTML form data as JSON dict (use double quotes!), only used with FormRequest request type, can use {main page attribute}, {page} and {follow_page} placeholders.')), + ('dont_filter', models.BooleanField(default=False, help_text='Do not filter duplicate requests, useful for some scenarios with requests falsely marked as being duplicate (e.g. uniform URL + pagination by HTTP header).')), + ('comments', models.TextField(blank=True)), ], - options={ - }, - bases=(models.Model,), ), migrations.CreateModel( name='SchedulerRuntime', fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), - ('runtime_type', models.CharField(default=b'P', max_length=1, choices=[(b'S', b'SCRAPER'), (b'C', b'CHECKER')])), + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('runtime_type', models.CharField(choices=[('S', 'SCRAPER'), ('C', 'CHECKER')], default='P', max_length=1)), ('next_action_time', models.DateTimeField(default=datetime.datetime.now)), - ('next_action_factor', models.FloatField(null=True, blank=True)), + ('next_action_factor', models.FloatField(blank=True, null=True)), ('num_zero_actions', models.IntegerField(default=0)), ], options={ 'ordering': ['next_action_time'], }, - bases=(models.Model,), ), migrations.CreateModel( name='ScrapedObjAttr', fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('name', models.CharField(max_length=200)), - ('attr_type', models.CharField(max_length=1, choices=[(b'S', b'STANDARD'), (b'T', b'STANDARD (UPDATE)'), (b'B', b'BASE'), (b'U', b'DETAIL_PAGE_URL'), (b'I', b'IMAGE')])), + ('order', models.IntegerField(default=100)), + ('attr_type', models.CharField(choices=[('S', 'STANDARD'), ('T', 'STANDARD (UPDATE)'), ('B', 'BASE'), ('U', 'DETAIL_PAGE_URL'), ('I', 'IMAGE')], max_length=1)), + ('id_field', models.BooleanField(default=False)), + ('save_to_db', models.BooleanField(default=True)), ], options={ + 'ordering': ['order'], }, - bases=(models.Model,), ), migrations.CreateModel( name='ScrapedObjClass', fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('name', models.CharField(max_length=200)), - ('scraper_scheduler_conf', models.TextField(default=b'"MIN_TIME": 15,\n"MAX_TIME": 10080,\n"INITIAL_NEXT_ACTION_FACTOR": 10,\n"ZERO_ACTIONS_FACTOR_CHANGE": 20,\n"FACTOR_CHANGE_FACTOR": 1.3,\n')), - ('checker_scheduler_conf', models.TextField(default=b'"MIN_TIME": 1440,\n"MAX_TIME": 10080,\n"INITIAL_NEXT_ACTION_FACTOR": 1,\n"ZERO_ACTIONS_FACTOR_CHANGE": 5,\n"FACTOR_CHANGE_FACTOR": 1.3,\n')), + ('scraper_scheduler_conf', models.TextField(default='"MIN_TIME": 15,\n"MAX_TIME": 10080,\n"INITIAL_NEXT_ACTION_FACTOR": 10,\n"ZERO_ACTIONS_FACTOR_CHANGE": 20,\n"FACTOR_CHANGE_FACTOR": 1.3,\n')), + ('checker_scheduler_conf', models.TextField(default='"MIN_TIME": 1440,\n"MAX_TIME": 10080,\n"INITIAL_NEXT_ACTION_FACTOR": 1,\n"ZERO_ACTIONS_FACTOR_CHANGE": 5,\n"FACTOR_CHANGE_FACTOR": 1.3,\n')), ('comments', models.TextField(blank=True)), ], options={ + 'verbose_name': 'Scraped object class', + 'verbose_name_plural': 'Scraped object classes', 'ordering': ['name'], }, - bases=(models.Model,), ), migrations.CreateModel( name='Scraper', fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('name', models.CharField(max_length=200)), - ('status', models.CharField(default=b'P', max_length=1, choices=[(b'A', b'ACTIVE'), (b'M', b'MANUAL'), (b'P', b'PAUSED'), (b'I', b'INACTIVE')])), - ('content_type', models.CharField(default=b'H', max_length=1, choices=[(b'H', b'HTML'), (b'X', b'XML')])), - ('max_items_read', models.IntegerField(help_text=b'Max number of items to be read (empty: unlimited).', null=True, blank=True)), - ('max_items_save', models.IntegerField(help_text=b'Max number of items to be saved (empty: unlimited).', null=True, blank=True)), - ('pagination_type', models.CharField(default=b'N', max_length=1, choices=[(b'N', b'NONE'), (b'R', b'RANGE_FUNCT'), (b'F', b'FREE_LIST')])), + ('status', models.CharField(choices=[('A', 'ACTIVE'), ('M', 'MANUAL'), ('P', 'PAUSED'), ('I', 'INACTIVE')], default='P', help_text='Runtime status of the scraper, used by scheduling mechanism.', max_length=1)), + ('work_status', models.CharField(choices=[('R2', 'REVISION NEEDED (MAJOR)'), ('R1', 'REVISION NEEDED (MINOR)'), ('UR', 'UNRESOLVED'), ('BR', 'BROKEN'), ('W', 'WORKING'), ('RC', 'RELEASE CANDIDATE'), ('BE', 'BETA'), ('A', 'ALPHA'), ('D', 'DRAFT'), ('S', 'SUSPENDED'), ('U', 'UNKNOWN'), ('N', 'NOT SET')], default='N', help_text='Internal work/progress status of the scraper.', max_length=2)), + ('owner', models.CharField(blank=True, help_text='Optional owner when working on scrapers with various people', max_length=12)), + ('max_items_read', models.IntegerField(blank=True, help_text='Max number of items to be read (empty: unlimited).', null=True)), + ('max_items_save', models.IntegerField(blank=True, help_text='Max number of items to be saved (empty: unlimited).', null=True)), + ('pagination_type', models.CharField(choices=[('N', 'NONE'), ('R', 'RANGE_FUNCT (+FOLLOW)'), ('F', 'FREE_LIST (+FOLLOW)'), ('O', 'FOLLOW')], default='N', max_length=1)), ('pagination_on_start', models.BooleanField(default=False)), - ('pagination_append_str', models.CharField(help_text=b'Syntax: /somepartofurl/{page}/moreurlstuff.html', max_length=200, blank=True)), - ('pagination_page_replace', models.TextField(help_text=b"RANGE_FUNCT: uses Python range funct., syntax: [start], stop[, step], FREE_LIST: 'Replace text 1', 'Some other text 2', 'Maybe a number 3', ...", blank=True)), - ('checker_type', models.CharField(default=b'N', max_length=1, choices=[(b'N', b'NONE'), (b'4', b'404'), (b'X', b'404_OR_X_PATH')])), - ('checker_x_path', models.CharField(max_length=200, blank=True)), - ('checker_x_path_result', models.CharField(max_length=200, blank=True)), - ('checker_ref_url', models.URLField(max_length=500, blank=True)), + ('pagination_append_str', models.CharField(blank=True, help_text='Syntax: /somepartofurl/{page}/moreurlstuff.html', max_length=200)), + ('pagination_page_replace', models.TextField(blank=True, help_text="RANGE_FUNCT: uses Python range funct., syntax: [start], stop[, step], FREE_LIST: 'Replace text 1', 'Some other text 2', 'Maybe a number 3', ...")), + ('follow_pages_url_xpath', models.TextField(blank=True, help_text='Optional, follow links from a single non-paginated or all statically paginated (RANGE_FUNCT, FREE_LIST) main pages')), + ('follow_pages_page_xpath', models.TextField(blank=True, help_text='Optional additional XPath for the page number, can be used in {follow_page} placeholder.')), + ('num_pages_follow', models.IntegerField(blank=True, help_text='Optionally limit number of pages to follow (default: follow until XPath fails)', null=True)), + ('last_scraper_save_alert_period', models.CharField(blank=True, help_text="Optional, used for scraper monitoring with 'check_last_scraper_saves' management cmd, syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')", max_length=5)), + ('next_last_scraper_save_alert', models.DateTimeField(default=datetime.datetime.now, help_text='Next time the last scraper save will be alerted, normally set on management cmd run.')), + ('last_checker_delete_alert_period', models.CharField(blank=True, help_text="Optional, used for scraper monitoring with 'check_last_checker_deletes' management cmd, syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')", max_length=5)), + ('next_last_checker_delete_alert', models.DateTimeField(default=datetime.datetime.now, help_text='Next time the last checker delete will be alerted, normally set on management cmd run.')), ('comments', models.TextField(blank=True)), - ('scraped_obj_class', models.ForeignKey(to='dynamic_scraper.ScrapedObjClass')), + ('last_scraper_save', models.DateTimeField(blank=True, null=True)), + ('last_checker_delete', models.DateTimeField(blank=True, null=True)), + ('scraped_obj_class', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.ScrapedObjClass')), ], options={ 'ordering': ['name', 'scraped_obj_class'], }, - bases=(models.Model,), ), migrations.CreateModel( name='ScraperElem', fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), - ('x_path', models.CharField(max_length=200)), - ('reg_exp', models.CharField(max_length=200, blank=True)), - ('from_detail_page', models.BooleanField(default=False)), - ('processors', models.CharField(max_length=200, blank=True)), - ('proc_ctxt', models.CharField(max_length=200, blank=True)), - ('mandatory', models.BooleanField(default=True)), - ('scraped_obj_attr', models.ForeignKey(to='dynamic_scraper.ScrapedObjAttr')), - ('scraper', models.ForeignKey(to='dynamic_scraper.Scraper')), + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('x_path', models.TextField(blank=True, help_text='XPath or JSONPath expression, leave blank on "static" processor use.')), + ('reg_exp', models.TextField(blank=True, help_text="Optional filtering by regular expression (e.g. 'Scrape only (.*) the text in between').")), + ('request_page_type', models.CharField(choices=[('MP', 'Main Page'), ('DP1', 'Detail Page 1'), ('DP2', 'Detail Page 2'), ('DP3', 'Detail Page 3'), ('DP4', 'Detail Page 4'), ('DP5', 'Detail Page 5'), ('DP6', 'Detail Page 6'), ('DP7', 'Detail Page 7'), ('DP8', 'Detail Page 8'), ('DP9', 'Detail Page 9'), ('DP10', 'Detail Page 10'), ('DP11', 'Detail Page 11'), ('DP12', 'Detail Page 12'), ('DP13', 'Detail Page 13'), ('DP14', 'Detail Page 14'), ('DP15', 'Detail Page 15'), ('DP16', 'Detail Page 16'), ('DP17', 'Detail Page 17'), ('DP18', 'Detail Page 18'), ('DP19', 'Detail Page 19'), ('DP20', 'Detail Page 20'), ('DP21', 'Detail Page 21'), ('DP22', 'Detail Page 22'), ('DP23', 'Detail Page 23'), ('DP24', 'Detail Page 24'), ('DP25', 'Detail Page 25')], default='MP', help_text='Corresponding Request Page Types created for this scraper.', max_length=3)), + ('use_default_procs', models.BooleanField(default=True, help_text='Use the default processors (Scrapy TakeFirst, DDS string_strip) for convenience.')), + ('processors', models.TextField(blank=True, help_text='Optional comma-separated list of processors used (e.g. "pre_url, post_string").')), + ('proc_ctxt', models.TextField(blank=True, help_text="Comma-separated aditional context (depending on processor) (e.g. 'pre_url': 'http://append_before.org/', 'post_string': '?append_after=True').")), + ('mandatory', models.BooleanField(default=True, help_text='Drop item if attribute could not be scraped.')), + ('scraped_obj_attr', models.ForeignKey(help_text='The different attributes to be scraped, exactly one attribute of type BASE necessary.', on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.ScrapedObjAttr')), + ('scraper', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.Scraper')), ], options={ + 'ordering': ['scraped_obj_attr__order'], }, - bases=(models.Model,), ), migrations.AddField( model_name='scrapedobjattr', name='obj_class', - field=models.ForeignKey(to='dynamic_scraper.ScrapedObjClass'), - preserve_default=True, + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.ScrapedObjClass'), + ), + migrations.AddField( + model_name='requestpagetype', + name='scraped_obj_attr', + field=models.ForeignKey(blank=True, help_text='Empty for main page, attribute of type DETAIL_PAGE_URL scraped from main page for detail pages.', null=True, on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.ScrapedObjAttr'), + ), + migrations.AddField( + model_name='requestpagetype', + name='scraper', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.Scraper'), ), migrations.AddField( model_name='logmarker', name='scraper', - field=models.ForeignKey(blank=True, to='dynamic_scraper.Scraper', null=True), - preserve_default=True, + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.Scraper'), ), migrations.AddField( model_name='log', name='scraper', - field=models.ForeignKey(blank=True, to='dynamic_scraper.Scraper', null=True), - preserve_default=True, + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.Scraper'), + ), + migrations.AddField( + model_name='checker', + name='scraped_obj_attr', + field=models.ForeignKey(help_text='Attribute of type DETAIL_PAGE_URL, several checkers for same DETAIL_PAGE_URL attribute possible.', on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.ScrapedObjAttr'), + ), + migrations.AddField( + model_name='checker', + name='scraper', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.Scraper'), ), ] diff --git a/dynamic_scraper/migrations/0002_scraper_render_javascript.py b/dynamic_scraper/migrations/0002_scraper_render_javascript.py deleted file mode 100644 index d94dc4ad..00000000 --- a/dynamic_scraper/migrations/0002_scraper_render_javascript.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0001_initial'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='render_javascript', - field=models.BooleanField(default=False, help_text=b'Render Javascript on pages (ScrapyJS/Splash deployment needed, careful: resource intense)'), - ), - ] diff --git a/dynamic_scraper/migrations/0003_auto_20150610_0906.py b/dynamic_scraper/migrations/0003_auto_20150610_0906.py deleted file mode 100644 index 87c8cbdf..00000000 --- a/dynamic_scraper/migrations/0003_auto_20150610_0906.py +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0002_scraper_render_javascript'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='detail_page_content_type', - field=models.CharField(default=b'H', help_text=b'Data type format for detail pages and checker (for JSON use JSONPath instead of XPath)', max_length=1, choices=[(b'H', b'HTML'), (b'X', b'XML'), (b'J', b'JSON')]), - ), - migrations.AlterField( - model_name='scraper', - name='content_type', - field=models.CharField(default=b'H', help_text=b'Data type format for scraped main pages (for JSON use JSONPath instead of XPath)', max_length=1, choices=[(b'H', b'HTML'), (b'X', b'XML'), (b'J', b'JSON')]), - ), - ] diff --git a/dynamic_scraper/migrations/0004_scrapedobjattr_id_field.py b/dynamic_scraper/migrations/0004_scrapedobjattr_id_field.py deleted file mode 100644 index 6a72a0d6..00000000 --- a/dynamic_scraper/migrations/0004_scrapedobjattr_id_field.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -def add_id_to_detail_page_url_scraped_obj_attributes(apps, schema_editor): - ScrapedObjAttr = apps.get_model("dynamic_scraper", "ScrapedObjAttr") - for soa in ScrapedObjAttr.objects.all(): - if soa.attr_type == 'U': - soa.id_field = True - soa.save() - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0003_auto_20150610_0906'), - ] - - operations = [ - migrations.AddField( - model_name='scrapedobjattr', - name='id_field', - field=models.BooleanField(default=False), - ), - migrations.RunPython(add_id_to_detail_page_url_scraped_obj_attributes) - ] diff --git a/dynamic_scraper/migrations/0005_new_dict_params_for_scraper.py b/dynamic_scraper/migrations/0005_new_dict_params_for_scraper.py deleted file mode 100644 index b4f2995d..00000000 --- a/dynamic_scraper/migrations/0005_new_dict_params_for_scraper.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0004_scrapedobjattr_id_field'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='cookies', - field=models.TextField(help_text=b'Optional cookies as JSON dict, can use {page} placeholder of pagination.', blank=True), - ), - migrations.AddField( - model_name='scraper', - name='form_data', - field=models.TextField(help_text=b'Optional HTML form data as JSON dict, only used with FormRequest request type, can use {page} placeholder of pagination.', blank=True), - ), - migrations.AddField( - model_name='scraper', - name='headers', - field=models.TextField(help_text=b"Optional HTTP headers sent with each request - provided as a JSON dict (e.g. {'Referer':'http://referer_url'})).", blank=True), - ), - migrations.AddField( - model_name='scraper', - name='meta', - field=models.TextField(help_text=b'Optional Scrapy meta attributes as JSON dict, see Scrapy docs for reference.', blank=True), - ), - migrations.AddField( - model_name='scraper', - name='request_type', - field=models.CharField(default=b'R', help_text=b'Normal GET request (default) or form request via POST, using Scrapys corresponding request classes (not used for checker).', max_length=1, choices=[(b'R', b'Request (GET)'), (b'F', b'FormRequest (POST)')]), - ), - ] diff --git a/dynamic_scraper/migrations/0006_request_type_and_body.py b/dynamic_scraper/migrations/0006_request_type_and_body.py deleted file mode 100644 index 3aa52679..00000000 --- a/dynamic_scraper/migrations/0006_request_type_and_body.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0005_new_dict_params_for_scraper'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='body', - field=models.TextField(help_text=b'Optional HTTP message body provided as a unicode string, can use {page} placeholder of pagination.', blank=True), - ), - migrations.AddField( - model_name='scraper', - name='method', - field=models.CharField(default=b'GET', help_text=b'HTTP request via GET or POST.', max_length=10, choices=[(b'GET', b'GET'), (b'POST', b'POST')]), - ), - migrations.AlterField( - model_name='scraper', - name='cookies', - field=models.TextField(help_text=b'Optional cookies as JSON dict (use double quotes!), can use {page} placeholder of pagination.', blank=True), - ), - migrations.AlterField( - model_name='scraper', - name='form_data', - field=models.TextField(help_text=b'Optional HTML form data as JSON dict (use double quotes!), only used with FormRequest request type, can use {page} placeholder of pagination.', blank=True), - ), - migrations.AlterField( - model_name='scraper', - name='headers', - field=models.TextField(help_text=b'Optional HTTP headers sent with each request, provided as a JSON dict (e.g. {"Referer":"http://referer_url"}, use double quotes!)), can use {page} placeholder of pagination.', blank=True), - ), - migrations.AlterField( - model_name='scraper', - name='meta', - field=models.TextField(help_text=b'Optional Scrapy meta attributes as JSON dict (use double quotes!), see Scrapy docs for reference.', blank=True), - ), - migrations.AlterField( - model_name='scraper', - name='request_type', - field=models.CharField(default=b'R', help_text=b'Normal (typically GET) request (default) or form request (typically POST), using Scrapys corresponding request classes (not used for checker).', max_length=1, choices=[(b'R', b'Request'), (b'F', b'FormRequest')]), - ), - ] diff --git a/dynamic_scraper/migrations/0007_dont_filter_attribute.py b/dynamic_scraper/migrations/0007_dont_filter_attribute.py deleted file mode 100644 index a74bca1a..00000000 --- a/dynamic_scraper/migrations/0007_dont_filter_attribute.py +++ /dev/null @@ -1,19 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0006_request_type_and_body'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='dont_filter', - field=models.BooleanField(default=False, help_text=b'Do not filter duplicate requests, useful for some scenarios with requests falsely marked as being duplicate (e.g. uniform URL + pagination by HTTP header).'), - ), - ] diff --git a/dynamic_scraper/migrations/0008_new_request_page_types_construct.py b/dynamic_scraper/migrations/0008_new_request_page_types_construct.py deleted file mode 100644 index 8c437d7a..00000000 --- a/dynamic_scraper/migrations/0008_new_request_page_types_construct.py +++ /dev/null @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -def move_from_detail_page_to_request_page_type(apps, schema_editor): - ScraperElem = apps.get_model("dynamic_scraper", "ScraperElem") - for se in ScraperElem.objects.all(): - if se.from_detail_page: - se.request_page_type = 'DP1' - else: - se.request_page_type = 'MP' - se.save() - -def create_default_request_page_type_objects(apps, schema_editor): - Scraper = apps.get_model("dynamic_scraper", "Scraper") - RequestPageType = apps.get_model("dynamic_scraper", "RequestPageType") - for scraper in Scraper.objects.all(): - rpt_main = RequestPageType(page_type='MP', scraper=scraper, content_type=scraper.content_type, \ - render_javascript=scraper.render_javascript, request_type=scraper.request_type, method=scraper.method, \ - headers=scraper.headers, body=scraper.body, cookies=scraper.cookies, meta=scraper.meta, \ - form_data=scraper.form_data, dont_filter=scraper.dont_filter) - rpt_main.save() - - dpu_elems = scraper.scraperelem_set.filter(scraped_obj_attr__attr_type='U') - if len(dpu_elems) > 0: - dpu_elem = dpu_elems[0] - rpt_dp = RequestPageType(page_type='DP1', scraper=scraper, content_type=scraper.detail_page_content_type, \ - scraped_obj_attr=dpu_elem.scraped_obj_attr, render_javascript=scraper.render_javascript, \ - headers=scraper.headers, body=scraper.body, cookies=scraper.cookies, meta=scraper.meta,) - rpt_dp.save() - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0007_dont_filter_attribute'), - ] - - operations = [ - migrations.CreateModel( - name='RequestPageType', - fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), - ('page_type', models.CharField(max_length=3, choices=[(b'MP', b'Main Page'), (b'DP1', b'Detail Page 1'), (b'DP2', b'Detail Page 2'), (b'DP3', b'Detail Page 3'), (b'DP4', b'Detail Page 4'), (b'DP5', b'Detail Page 5'), (b'DP6', b'Detail Page 6'), (b'DP7', b'Detail Page 7'), (b'DP8', b'Detail Page 8'), (b'DP9', b'Detail Page 9'), (b'DP10', b'Detail Page 10'), (b'DP11', b'Detail Page 11'), (b'DP12', b'Detail Page 12'), (b'DP13', b'Detail Page 13'), (b'DP14', b'Detail Page 14'), (b'DP15', b'Detail Page 15'), (b'DP16', b'Detail Page 16'), (b'DP17', b'Detail Page 17'), (b'DP18', b'Detail Page 18'), (b'DP19', b'Detail Page 19'), (b'DP20', b'Detail Page 20'), (b'DP21', b'Detail Page 21'), (b'DP22', b'Detail Page 22'), (b'DP23', b'Detail Page 23'), (b'DP24', b'Detail Page 24'), (b'DP25', b'Detail Page 25')])), - ('content_type', models.CharField(default=b'H', help_text=b'Data type format for scraped pages of page type (for JSON use JSONPath instead of XPath)', max_length=1, choices=[(b'H', b'HTML'), (b'X', b'XML'), (b'J', b'JSON')])), - ('render_javascript', models.BooleanField(default=False, help_text=b'Render Javascript on pages (ScrapyJS/Splash deployment needed, careful: resource intense)')), - ('request_type', models.CharField(default=b'R', help_text=b'Normal (typically GET) request (default) or form request (typically POST), using Scrapys corresponding request classes (not used for checker).', max_length=1, choices=[(b'R', b'Request'), (b'F', b'FormRequest')])), - ('method', models.CharField(default=b'GET', help_text=b'HTTP request via GET or POST.', max_length=10, choices=[(b'GET', b'GET'), (b'POST', b'POST')])), - ('headers', models.TextField(help_text=b'Optional HTTP headers sent with each request, provided as a JSON dict (e.g. {"Referer":"http://referer_url"}, use double quotes!)), can use {page} placeholder of pagination.', blank=True)), - ('body', models.TextField(help_text=b'Optional HTTP message body provided as a unicode string, can use {page} placeholder of pagination.', blank=True)), - ('cookies', models.TextField(help_text=b'Optional cookies as JSON dict (use double quotes!), can use {page} placeholder of pagination.', blank=True)), - ('meta', models.TextField(help_text=b'Optional Scrapy meta attributes as JSON dict (use double quotes!), see Scrapy docs for reference.', blank=True)), - ('form_data', models.TextField(help_text=b'Optional HTML form data as JSON dict (use double quotes!), only used with FormRequest request type, can use {page} placeholder of pagination.', blank=True)), - ('dont_filter', models.BooleanField(default=False, help_text=b'Do not filter duplicate requests, useful for some scenarios with requests falsely marked as being duplicate (e.g. uniform URL + pagination by HTTP header).')), - ('scraped_obj_attr', models.ForeignKey(blank=True, to='dynamic_scraper.ScrapedObjAttr', help_text=b'Empty for main page, attribute of type URL scraped from main page for detail pages.', null=True)), - ('scraper', models.ForeignKey(to='dynamic_scraper.Scraper')), - ], - ), - migrations.AddField( - model_name='scraperelem', - name='request_page_type', - field=models.CharField(default=b'MP', max_length=3, choices=[(b'MP', b'Main Page'), (b'DP1', b'Detail Page 1'), (b'DP2', b'Detail Page 2'), (b'DP3', b'Detail Page 3'), (b'DP4', b'Detail Page 4'), (b'DP5', b'Detail Page 5'), (b'DP6', b'Detail Page 6'), (b'DP7', b'Detail Page 7'), (b'DP8', b'Detail Page 8'), (b'DP9', b'Detail Page 9'), (b'DP10', b'Detail Page 10'), (b'DP11', b'Detail Page 11'), (b'DP12', b'Detail Page 12'), (b'DP13', b'Detail Page 13'), (b'DP14', b'Detail Page 14'), (b'DP15', b'Detail Page 15'), (b'DP16', b'Detail Page 16'), (b'DP17', b'Detail Page 17'), (b'DP18', b'Detail Page 18'), (b'DP19', b'Detail Page 19'), (b'DP20', b'Detail Page 20'), (b'DP21', b'Detail Page 21'), (b'DP22', b'Detail Page 22'), (b'DP23', b'Detail Page 23'), (b'DP24', b'Detail Page 24'), (b'DP25', b'Detail Page 25')]), - ), - migrations.AddField( - model_name='scraperelem', - name='save_to_db', - field=models.BooleanField(default=True), - ), - migrations.RunPython(move_from_detail_page_to_request_page_type), - migrations.RunPython(create_default_request_page_type_objects), - ] diff --git a/dynamic_scraper/migrations/0009_removed_legacy_request_page_type_scraper_fields.py b/dynamic_scraper/migrations/0009_removed_legacy_request_page_type_scraper_fields.py deleted file mode 100644 index 49137a81..00000000 --- a/dynamic_scraper/migrations/0009_removed_legacy_request_page_type_scraper_fields.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0008_new_request_page_types_construct'), - ] - - operations = [ - migrations.RemoveField( - model_name='scraper', - name='body', - ), - migrations.RemoveField( - model_name='scraper', - name='content_type', - ), - migrations.RemoveField( - model_name='scraper', - name='cookies', - ), - migrations.RemoveField( - model_name='scraper', - name='detail_page_content_type', - ), - migrations.RemoveField( - model_name='scraper', - name='dont_filter', - ), - migrations.RemoveField( - model_name='scraper', - name='form_data', - ), - migrations.RemoveField( - model_name='scraper', - name='headers', - ), - migrations.RemoveField( - model_name='scraper', - name='meta', - ), - migrations.RemoveField( - model_name='scraper', - name='method', - ), - migrations.RemoveField( - model_name='scraper', - name='render_javascript', - ), - migrations.RemoveField( - model_name='scraper', - name='request_type', - ), - migrations.RemoveField( - model_name='scraperelem', - name='from_detail_page', - ), - ] diff --git a/dynamic_scraper/migrations/0010_move_save_to_db_to_scraped_obj_attr.py b/dynamic_scraper/migrations/0010_move_save_to_db_to_scraped_obj_attr.py deleted file mode 100644 index 487120ee..00000000 --- a/dynamic_scraper/migrations/0010_move_save_to_db_to_scraped_obj_attr.py +++ /dev/null @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0009_removed_legacy_request_page_type_scraper_fields'), - ] - - operations = [ - migrations.RemoveField( - model_name='scraperelem', - name='save_to_db', - ), - migrations.AddField( - model_name='scrapedobjattr', - name='save_to_db', - field=models.BooleanField(default=True), - ), - ] diff --git a/dynamic_scraper/migrations/0011_extracted_checker_attributes_to_own_checker_class.py b/dynamic_scraper/migrations/0011_extracted_checker_attributes_to_own_checker_class.py deleted file mode 100644 index f7a067d5..00000000 --- a/dynamic_scraper/migrations/0011_extracted_checker_attributes_to_own_checker_class.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -def create_default_checker_objects(apps, schema_editor): - Scraper = apps.get_model("dynamic_scraper", "Scraper") - Checker = apps.get_model("dynamic_scraper", "Checker") - for s in Scraper.objects.all(): - url_elem = None - url_id_elems = s.scraperelem_set.filter(scraped_obj_attr__attr_type='U', scraped_obj_attr__id_field=True) - url_elems = s.scraperelem_set.filter(scraped_obj_attr__attr_type='U') - if url_id_elems.count() > 0: - url_elem = url_id_elems[0] - elif url_elems.count() > 0: - url_elem = url_elems[0] - if s.checker_type != 'N' and url_elem: - c = Checker(scraped_obj_attr=url_elem.scraped_obj_attr, scraper=s, checker_type=s.checker_type, \ - checker_x_path=s.checker_x_path, checker_x_path_result=s.checker_x_path_result, \ - checker_ref_url=s.checker_ref_url) - c.save() - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0010_move_save_to_db_to_scraped_obj_attr'), - ] - - operations = [ - migrations.CreateModel( - name='Checker', - fields=[ - ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), - ('checker_type', models.CharField(default=b'4', max_length=1, choices=[(b'4', b'404'), (b'X', b'404_OR_X_PATH')])), - ('checker_x_path', models.CharField(max_length=200, blank=True)), - ('checker_x_path_result', models.CharField(max_length=200, blank=True)), - ('checker_ref_url', models.URLField(max_length=500, blank=True)), - ('comments', models.TextField(blank=True)), - ('scraped_obj_attr', models.ForeignKey(help_text=b'Attribute of type DETAIL_PAGE_URL, several checkers for same DETAIL_PAGE_URL attribute possible.', to='dynamic_scraper.ScrapedObjAttr')), - ('scraper', models.ForeignKey(to='dynamic_scraper.Scraper')), - ], - ), - migrations.AddField( - model_name='requestpagetype', - name='comments', - field=models.TextField(blank=True), - ), - migrations.AlterField( - model_name='requestpagetype', - name='scraped_obj_attr', - field=models.ForeignKey(blank=True, to='dynamic_scraper.ScrapedObjAttr', help_text=b'Empty for main page, attribute of type DETAIL_PAGE_URL scraped from main page for detail pages.', null=True), - ), - migrations.RunPython(create_default_checker_objects), - ] diff --git a/dynamic_scraper/migrations/0012_removed_legacy_checker_scraper_attributes.py b/dynamic_scraper/migrations/0012_removed_legacy_checker_scraper_attributes.py deleted file mode 100644 index 2ad56743..00000000 --- a/dynamic_scraper/migrations/0012_removed_legacy_checker_scraper_attributes.py +++ /dev/null @@ -1,30 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0011_extracted_checker_attributes_to_own_checker_class'), - ] - - operations = [ - migrations.RemoveField( - model_name='scraper', - name='checker_ref_url', - ), - migrations.RemoveField( - model_name='scraper', - name='checker_type', - ), - migrations.RemoveField( - model_name='scraper', - name='checker_x_path', - ), - migrations.RemoveField( - model_name='scraper', - name='checker_x_path_result', - ), - ] diff --git a/dynamic_scraper/migrations/0013_added_scraper_save_and_checker_delete_datetime_fields.py b/dynamic_scraper/migrations/0013_added_scraper_save_and_checker_delete_datetime_fields.py deleted file mode 100644 index 5f5b6a57..00000000 --- a/dynamic_scraper/migrations/0013_added_scraper_save_and_checker_delete_datetime_fields.py +++ /dev/null @@ -1,28 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import models, migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0012_removed_legacy_checker_scraper_attributes'), - ] - - operations = [ - migrations.AlterModelOptions( - name='scrapedobjclass', - options={'ordering': ['name'], 'verbose_name': 'Scraped object class', 'verbose_name_plural': 'Scraped object classes'}, - ), - migrations.AddField( - model_name='scraper', - name='last_checker_delete', - field=models.DateTimeField(null=True, blank=True), - ), - migrations.AddField( - model_name='scraper', - name='last_scraper_save', - field=models.DateTimeField(null=True, blank=True), - ), - ] diff --git a/dynamic_scraper/migrations/0014_added_scraper_save_and_checker_delete_alert_period_fields_for_scraper.py b/dynamic_scraper/migrations/0014_added_scraper_save_and_checker_delete_alert_period_fields_for_scraper.py deleted file mode 100644 index 43b21cbb..00000000 --- a/dynamic_scraper/migrations/0014_added_scraper_save_and_checker_delete_alert_period_fields_for_scraper.py +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0013_added_scraper_save_and_checker_delete_datetime_fields'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='last_checker_delete_alert_period', - field=models.CharField(help_text="Optional, used for scraper monitoring with 'check_last_checker_deletes' management cmd, syntax: h or d or w (e.g. '6h', '5d', '2w')", max_length=5, blank=True), - ), - migrations.AddField( - model_name='scraper', - name='last_scraper_save_alert_period', - field=models.CharField(help_text="Optional, used for scraper monitoring with 'check_last_scraper_saves' management cmd, syntax: h or d or w (e.g. '6h', '5d', '2w')", max_length=5, blank=True), - ), - ] diff --git a/dynamic_scraper/migrations/0015_added_datetime_fields_for_last_scraper_save_and_checker_delete_alert.py b/dynamic_scraper/migrations/0015_added_datetime_fields_for_last_scraper_save_and_checker_delete_alert.py deleted file mode 100644 index bb030c8b..00000000 --- a/dynamic_scraper/migrations/0015_added_datetime_fields_for_last_scraper_save_and_checker_delete_alert.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import migrations, models -import datetime - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0014_added_scraper_save_and_checker_delete_alert_period_fields_for_scraper'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='next_last_checker_delete_alert', - field=models.DateTimeField(default=datetime.datetime.now, help_text='Next time the last checker delete will be alerted, normally set on management cmd run.'), - ), - migrations.AddField( - model_name='scraper', - name='next_last_scraper_save_alert', - field=models.DateTimeField(default=datetime.datetime.now, help_text='Next time the last scraper save will be alerted, normally set on management cmd run.'), - ), - migrations.AlterField( - model_name='scraper', - name='last_checker_delete_alert_period', - field=models.CharField(help_text="Optional, used for scraper monitoring with 'check_last_checker_deletes' management cmd, syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')", max_length=5, blank=True), - ), - migrations.AlterField( - model_name='scraper', - name='last_scraper_save_alert_period', - field=models.CharField(help_text="Optional, used for scraper monitoring with 'check_last_scraper_saves' management cmd, syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')", max_length=5, blank=True), - ), - ] diff --git a/dynamic_scraper/migrations/0016_optional_xpath_fields_text_type_for_x_path_reg_exp_processor_fields.py b/dynamic_scraper/migrations/0016_optional_xpath_fields_text_type_for_x_path_reg_exp_processor_fields.py deleted file mode 100644 index f6263c87..00000000 --- a/dynamic_scraper/migrations/0016_optional_xpath_fields_text_type_for_x_path_reg_exp_processor_fields.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0015_added_datetime_fields_for_last_scraper_save_and_checker_delete_alert'), - ] - - operations = [ - migrations.AlterField( - model_name='checker', - name='checker_x_path', - field=models.TextField(blank=True), - ), - migrations.AlterField( - model_name='checker', - name='checker_x_path_result', - field=models.TextField(blank=True), - ), - migrations.AlterField( - model_name='scraperelem', - name='proc_ctxt', - field=models.TextField(blank=True), - ), - migrations.AlterField( - model_name='scraperelem', - name='processors', - field=models.TextField(blank=True), - ), - migrations.AlterField( - model_name='scraperelem', - name='reg_exp', - field=models.TextField(blank=True), - ), - migrations.AlterField( - model_name='scraperelem', - name='x_path', - field=models.TextField(blank=True), - ), - ] diff --git a/dynamic_scraper/migrations/0017_added_order_to_scraped_obj_attr.py b/dynamic_scraper/migrations/0017_added_order_to_scraped_obj_attr.py deleted file mode 100644 index 05a54c6a..00000000 --- a/dynamic_scraper/migrations/0017_added_order_to_scraped_obj_attr.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0016_optional_xpath_fields_text_type_for_x_path_reg_exp_processor_fields'), - ] - - operations = [ - migrations.AlterModelOptions( - name='scrapedobjattr', - options={'ordering': ['order']}, - ), - migrations.AlterModelOptions( - name='scraperelem', - options={'ordering': ['scraped_obj_attr__order']}, - ), - migrations.AddField( - model_name='scrapedobjattr', - name='order', - field=models.IntegerField(default=100), - ), - ] diff --git a/dynamic_scraper/migrations/0018_added_use_default_procs_field_to_scraper_elems.py b/dynamic_scraper/migrations/0018_added_use_default_procs_field_to_scraper_elems.py deleted file mode 100644 index d9638866..00000000 --- a/dynamic_scraper/migrations/0018_added_use_default_procs_field_to_scraper_elems.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-07 03:26 -from __future__ import unicode_literals - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0017_added_order_to_scraped_obj_attr'), - ] - - operations = [ - migrations.AddField( - model_name='scraperelem', - name='use_default_procs', - field=models.BooleanField(default=True, help_text='Use the default processors (Scrapy TakeFirst, DDS string_strip) for convenience.'), - ), - migrations.AlterField( - model_name='scraperelem', - name='mandatory', - field=models.BooleanField(default=True, help_text='Drop item if attribute could not be scraped.'), - ), - migrations.AlterField( - model_name='scraperelem', - name='proc_ctxt', - field=models.TextField(blank=True, help_text="Comma-separated aditional context (depending on processor) (e.g. 'pre_url': 'http://append_before.org/', 'post_string': '?append_after=True')."), - ), - migrations.AlterField( - model_name='scraperelem', - name='processors', - field=models.TextField(blank=True, help_text='Optional comma-separated list of processors used (e.g. "pre_url, post_string").'), - ), - migrations.AlterField( - model_name='scraperelem', - name='reg_exp', - field=models.TextField(blank=True, help_text="Optional filtering by regular expression (e.g. 'Scrape only (.*) the text in between')."), - ), - migrations.AlterField( - model_name='scraperelem', - name='request_page_type', - field=models.CharField(choices=[('MP', 'Main Page'), ('DP1', 'Detail Page 1'), ('DP2', 'Detail Page 2'), ('DP3', 'Detail Page 3'), ('DP4', 'Detail Page 4'), ('DP5', 'Detail Page 5'), ('DP6', 'Detail Page 6'), ('DP7', 'Detail Page 7'), ('DP8', 'Detail Page 8'), ('DP9', 'Detail Page 9'), ('DP10', 'Detail Page 10'), ('DP11', 'Detail Page 11'), ('DP12', 'Detail Page 12'), ('DP13', 'Detail Page 13'), ('DP14', 'Detail Page 14'), ('DP15', 'Detail Page 15'), ('DP16', 'Detail Page 16'), ('DP17', 'Detail Page 17'), ('DP18', 'Detail Page 18'), ('DP19', 'Detail Page 19'), ('DP20', 'Detail Page 20'), ('DP21', 'Detail Page 21'), ('DP22', 'Detail Page 22'), ('DP23', 'Detail Page 23'), ('DP24', 'Detail Page 24'), ('DP25', 'Detail Page 25')], default='MP', help_text='Corresponding Request Page Types created for this scraper.', max_length=3), - ), - migrations.AlterField( - model_name='scraperelem', - name='scraped_obj_attr', - field=models.ForeignKey(help_text='The different attributes to be scraped, exactly one attribute of type BASE necessary.', on_delete=django.db.models.deletion.CASCADE, to='dynamic_scraper.ScrapedObjAttr'), - ), - migrations.AlterField( - model_name='scraperelem', - name='x_path', - field=models.TextField(blank=True, help_text='XPath or JSONPath expression, leave blank on "static" processor use.'), - ), - ] diff --git a/dynamic_scraper/migrations/0019_updated_help_texts_for_request_info_fields.py b/dynamic_scraper/migrations/0019_updated_help_texts_for_request_info_fields.py deleted file mode 100644 index 96c17908..00000000 --- a/dynamic_scraper/migrations/0019_updated_help_texts_for_request_info_fields.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-09 07:26 -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0018_added_use_default_procs_field_to_scraper_elems'), - ] - - operations = [ - migrations.AlterField( - model_name='requestpagetype', - name='body', - field=models.TextField(blank=True, help_text='Optional HTTP message body provided as a unicode string, can use {main page attribute} and {page} placeholders.'), - ), - migrations.AlterField( - model_name='requestpagetype', - name='cookies', - field=models.TextField(blank=True, help_text='Optional cookies as JSON dict (use double quotes!), can use {main page attribute} and {page} placeholders.'), - ), - migrations.AlterField( - model_name='requestpagetype', - name='form_data', - field=models.TextField(blank=True, help_text='Optional HTML form data as JSON dict (use double quotes!), only used with FormRequest request type, can use {main page attribute} and {page} placeholders.'), - ), - migrations.AlterField( - model_name='requestpagetype', - name='headers', - field=models.TextField(blank=True, help_text='Optional HTTP headers sent with each request, provided as a JSON dict (e.g. {"Referer":"http://referer_url"}, use double quotes!)), can use {main page attribute} and {page} placeholders.'), - ), - ] diff --git a/dynamic_scraper/migrations/0020_added_work_status_field_to_scraper.py b/dynamic_scraper/migrations/0020_added_work_status_field_to_scraper.py deleted file mode 100644 index 8e7ff23c..00000000 --- a/dynamic_scraper/migrations/0020_added_work_status_field_to_scraper.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-12 05:25 -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0019_updated_help_texts_for_request_info_fields'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='work_status', - field=models.CharField(choices=[('R2', 'REVISION NEEDED (MAJOR)'), ('R1', 'REVISION NEEDED (MINOR)'), ('BR', 'BROKEN'), ('W', 'WORKING'), ('RC', 'RELEASE CANDIDATE'), ('BE', 'BETA'), ('A', 'ALPHA'), ('D', 'DRAFT'), ('S', 'SUSPENDED'), ('U', 'UNKNOWN'), ('N', 'NOT SET')], default='N', max_length=2), - ), - ] diff --git a/dynamic_scraper/migrations/0021_added_owner_field_to_scraper.py b/dynamic_scraper/migrations/0021_added_owner_field_to_scraper.py deleted file mode 100644 index 31507c61..00000000 --- a/dynamic_scraper/migrations/0021_added_owner_field_to_scraper.py +++ /dev/null @@ -1,30 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-12 05:48 -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0020_added_work_status_field_to_scraper'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='owner', - field=models.CharField(blank=True, help_text='Optional owner when working on scrapers with various people', max_length=12), - ), - migrations.AlterField( - model_name='scraper', - name='status', - field=models.CharField(choices=[('A', 'ACTIVE'), ('M', 'MANUAL'), ('P', 'PAUSED'), ('I', 'INACTIVE')], default='P', help_text='Runtime status of the scraper, used by scheduling mechanism.', max_length=1), - ), - migrations.AlterField( - model_name='scraper', - name='work_status', - field=models.CharField(choices=[('R2', 'REVISION NEEDED (MAJOR)'), ('R1', 'REVISION NEEDED (MINOR)'), ('BR', 'BROKEN'), ('W', 'WORKING'), ('RC', 'RELEASE CANDIDATE'), ('BE', 'BETA'), ('A', 'ALPHA'), ('D', 'DRAFT'), ('S', 'SUSPENDED'), ('U', 'UNKNOWN'), ('N', 'NOT SET')], default='N', help_text='Internal work/progress status of the scraper.', max_length=2), - ), - ] diff --git a/dynamic_scraper/migrations/0022_added_option_for_scraper_work_status.py b/dynamic_scraper/migrations/0022_added_option_for_scraper_work_status.py deleted file mode 100644 index ebb90047..00000000 --- a/dynamic_scraper/migrations/0022_added_option_for_scraper_work_status.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-13 06:16 -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0021_added_owner_field_to_scraper'), - ] - - operations = [ - migrations.AlterField( - model_name='scraper', - name='work_status', - field=models.CharField(choices=[('R2', 'REVISION NEEDED (MAJOR)'), ('R1', 'REVISION NEEDED (MINOR)'), ('UR', 'UNRESOLVED'), ('BR', 'BROKEN'), ('W', 'WORKING'), ('RC', 'RELEASE CANDIDATE'), ('BE', 'BETA'), ('A', 'ALPHA'), ('D', 'DRAFT'), ('S', 'SUSPENDED'), ('U', 'UNKNOWN'), ('N', 'NOT SET')], default='N', help_text='Internal work/progress status of the scraper.', max_length=2), - ), - ] diff --git a/dynamic_scraper/migrations/0023_added_follow_pages_by_xpath_and_num_pages_follow_atts_to_scraper_pagination.py b/dynamic_scraper/migrations/0023_added_follow_pages_by_xpath_and_num_pages_follow_atts_to_scraper_pagination.py deleted file mode 100644 index 5de2193d..00000000 --- a/dynamic_scraper/migrations/0023_added_follow_pages_by_xpath_and_num_pages_follow_atts_to_scraper_pagination.py +++ /dev/null @@ -1,30 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-14 05:31 -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0022_added_option_for_scraper_work_status'), - ] - - operations = [ - migrations.AddField( - model_name='scraper', - name='follow_pages_by_xpath', - field=models.TextField(blank=True, help_text='Optional, follow links from a single non-paginated or all statically paginated (RANGE_FUNCT, FREE_LIST) main pages'), - ), - migrations.AddField( - model_name='scraper', - name='num_pages_follow', - field=models.IntegerField(blank=True, help_text='Optionally limit number of pages to follow (default: follow until XPath fails)', null=True), - ), - migrations.AlterField( - model_name='scraper', - name='pagination_type', - field=models.CharField(choices=[('N', 'NONE'), ('R', 'RANGE_FUNCT (+FOLLOW)'), ('F', 'FREE_LIST (+FOLLOW)'), ('O', 'FOLLOW')], default='N', max_length=1), - ), - ] diff --git a/dynamic_scraper/migrations/0024_new_follow_page_type_choice_for_rpt.py b/dynamic_scraper/migrations/0024_new_follow_page_type_choice_for_rpt.py deleted file mode 100644 index bfcf81b0..00000000 --- a/dynamic_scraper/migrations/0024_new_follow_page_type_choice_for_rpt.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-14 06:22 -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0023_added_follow_pages_by_xpath_and_num_pages_follow_atts_to_scraper_pagination'), - ] - - operations = [ - migrations.AlterField( - model_name='requestpagetype', - name='page_type', - field=models.CharField(choices=[('MP', 'Main Page'), ('FP', 'Follow Page'), ('DP1', 'Detail Page 1'), ('DP2', 'Detail Page 2'), ('DP3', 'Detail Page 3'), ('DP4', 'Detail Page 4'), ('DP5', 'Detail Page 5'), ('DP6', 'Detail Page 6'), ('DP7', 'Detail Page 7'), ('DP8', 'Detail Page 8'), ('DP9', 'Detail Page 9'), ('DP10', 'Detail Page 10'), ('DP11', 'Detail Page 11'), ('DP12', 'Detail Page 12'), ('DP13', 'Detail Page 13'), ('DP14', 'Detail Page 14'), ('DP15', 'Detail Page 15'), ('DP16', 'Detail Page 16'), ('DP17', 'Detail Page 17'), ('DP18', 'Detail Page 18'), ('DP19', 'Detail Page 19'), ('DP20', 'Detail Page 20'), ('DP21', 'Detail Page 21'), ('DP22', 'Detail Page 22'), ('DP23', 'Detail Page 23'), ('DP24', 'Detail Page 24'), ('DP25', 'Detail Page 25')], help_text='One main page RPT, an optional follow page RPT (if follow pagination is used) and detail page RPTs for all DETAIL_PAGE_URLs', max_length=3), - ), - ] diff --git a/dynamic_scraper/migrations/0025_new_follow_pages_page_xpath_pagination_attribute.py b/dynamic_scraper/migrations/0025_new_follow_pages_page_xpath_pagination_attribute.py deleted file mode 100644 index 5dc0cb5e..00000000 --- a/dynamic_scraper/migrations/0025_new_follow_pages_page_xpath_pagination_attribute.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by Django 1.11b1 on 2017-06-23 03:16 -from __future__ import unicode_literals - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('dynamic_scraper', '0024_new_follow_page_type_choice_for_rpt'), - ] - - operations = [ - migrations.RenameField( - model_name='scraper', - old_name='follow_pages_by_xpath', - new_name='follow_pages_url_xpath', - ), - migrations.AddField( - model_name='scraper', - name='follow_pages_page_xpath', - field=models.TextField(blank=True, help_text='Optional additional XPath for the page number, can be used in {follow_page} placeholder.'), - ), - migrations.AlterField( - model_name='requestpagetype', - name='body', - field=models.TextField(blank=True, help_text='Optional HTTP message body provided as a unicode string, can use {main page attribute}, {page} and {follow_page} placeholders.'), - ), - migrations.AlterField( - model_name='requestpagetype', - name='cookies', - field=models.TextField(blank=True, help_text='Optional cookies as JSON dict (use double quotes!), can use {main page attribute}, {page} and {follow_page} placeholders.'), - ), - migrations.AlterField( - model_name='requestpagetype', - name='form_data', - field=models.TextField(blank=True, help_text='Optional HTML form data as JSON dict (use double quotes!), only used with FormRequest request type, can use {main page attribute}, {page} and {follow_page} placeholders.'), - ), - migrations.AlterField( - model_name='requestpagetype', - name='headers', - field=models.TextField(blank=True, help_text='Optional HTTP headers sent with each request, provided as a JSON dict (e.g. {"Referer":"http://referer_url"}, use double quotes!)), can use {main page attribute}, {page} and {follow_page} placeholders.'), - ), - ] diff --git a/dynamic_scraper/south_migrations/0001_initial.py b/dynamic_scraper/south_migrations/0001_initial.py deleted file mode 100644 index 0c121279..00000000 --- a/dynamic_scraper/south_migrations/0001_initial.py +++ /dev/null @@ -1,174 +0,0 @@ -# encoding: utf-8 -import datetime -from south.db import db -from south.v2 import SchemaMigration -from django.db import models - -class Migration(SchemaMigration): - - def forwards(self, orm): - - # Adding model 'ScrapedObjClass' - db.create_table('dynamic_scraper_scrapedobjclass', ( - ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), - ('name', self.gf('django.db.models.fields.CharField')(max_length=200)), - ('scraper_scheduler_conf', self.gf('django.db.models.fields.TextField')(default='"MIN_TIME": 15,\n"MAX_TIME": 10080,\n"INITIAL_NEXT_ACTION_FACTOR": 10,\n"ZERO_ACTIONS_FACTOR_CHANGE": 20,\n"FACTOR_CHANGE_FACTOR": 1.3,\n')), - ('checker_scheduler_conf', self.gf('django.db.models.fields.TextField')(default='"MIN_TIME": 1440,\n"MAX_TIME": 10080,\n"INITIAL_NEXT_ACTION_FACTOR": 1,\n"ZERO_ACTIONS_FACTOR_CHANGE": 5,\n"FACTOR_CHANGE_FACTOR": 1.3,\n')), - ('comments', self.gf('django.db.models.fields.TextField')(blank=True)), - )) - db.send_create_signal('dynamic_scraper', ['ScrapedObjClass']) - - # Adding model 'ScrapedObjAttr' - db.create_table('dynamic_scraper_scrapedobjattr', ( - ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), - ('name', self.gf('django.db.models.fields.CharField')(max_length=200)), - ('obj_class', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['dynamic_scraper.ScrapedObjClass'])), - ('attr_type', self.gf('django.db.models.fields.CharField')(max_length=1)), - )) - db.send_create_signal('dynamic_scraper', ['ScrapedObjAttr']) - - # Adding model 'Scraper' - db.create_table('dynamic_scraper_scraper', ( - ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), - ('name', self.gf('django.db.models.fields.CharField')(max_length=200)), - ('scraped_obj_class', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['dynamic_scraper.ScrapedObjClass'])), - ('status', self.gf('django.db.models.fields.CharField')(default='P', max_length=1)), - ('max_items_read', self.gf('django.db.models.fields.IntegerField')(null=True, blank=True)), - ('max_items_save', self.gf('django.db.models.fields.IntegerField')(null=True, blank=True)), - ('pagination_type', self.gf('django.db.models.fields.CharField')(default='N', max_length=1)), - ('pagination_on_start', self.gf('django.db.models.fields.BooleanField')(default=False)), - ('pagination_append_str', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('pagination_page_replace', self.gf('django.db.models.fields.TextField')(blank=True)), - ('checker_type', self.gf('django.db.models.fields.CharField')(default='N', max_length=1)), - ('checker_x_path', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('checker_x_path_result', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('checker_ref_url', self.gf('django.db.models.fields.URLField')(max_length=200, blank=True)), - ('comments', self.gf('django.db.models.fields.TextField')(blank=True)), - )) - db.send_create_signal('dynamic_scraper', ['Scraper']) - - # Adding model 'ScraperElem' - db.create_table('dynamic_scraper_scraperelem', ( - ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), - ('scraped_obj_attr', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['dynamic_scraper.ScrapedObjAttr'])), - ('scraper', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['dynamic_scraper.Scraper'])), - ('x_path', self.gf('django.db.models.fields.CharField')(max_length=200)), - ('reg_exp', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('from_detail_page', self.gf('django.db.models.fields.BooleanField')(default=False)), - ('processors', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('proc_ctxt', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('mandatory', self.gf('django.db.models.fields.BooleanField')(default=True)), - )) - db.send_create_signal('dynamic_scraper', ['ScraperElem']) - - # Adding model 'SchedulerRuntime' - db.create_table('dynamic_scraper_schedulerruntime', ( - ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), - ('runtime_type', self.gf('django.db.models.fields.CharField')(default='P', max_length=1)), - ('next_action_time', self.gf('django.db.models.fields.DateTimeField')(default=datetime.datetime.now)), - ('next_action_factor', self.gf('django.db.models.fields.FloatField')(null=True, blank=True)), - ('num_zero_actions', self.gf('django.db.models.fields.IntegerField')(default=0)), - )) - db.send_create_signal('dynamic_scraper', ['SchedulerRuntime']) - - # Adding model 'Log' - db.create_table('dynamic_scraper_log', ( - ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), - ('message', self.gf('django.db.models.fields.CharField')(max_length=255)), - ('ref_object', self.gf('django.db.models.fields.CharField')(max_length=200)), - ('level', self.gf('django.db.models.fields.IntegerField')()), - ('spider_name', self.gf('django.db.models.fields.CharField')(max_length=200)), - ('scraper', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['dynamic_scraper.Scraper'], null=True, blank=True)), - ('date', self.gf('django.db.models.fields.DateTimeField')(default=datetime.datetime.now)), - )) - db.send_create_signal('dynamic_scraper', ['Log']) - - - def backwards(self, orm): - - # Deleting model 'ScrapedObjClass' - db.delete_table('dynamic_scraper_scrapedobjclass') - - # Deleting model 'ScrapedObjAttr' - db.delete_table('dynamic_scraper_scrapedobjattr') - - # Deleting model 'Scraper' - db.delete_table('dynamic_scraper_scraper') - - # Deleting model 'ScraperElem' - db.delete_table('dynamic_scraper_scraperelem') - - # Deleting model 'SchedulerRuntime' - db.delete_table('dynamic_scraper_schedulerruntime') - - # Deleting model 'Log' - db.delete_table('dynamic_scraper_log') - - - models = { - 'dynamic_scraper.log': { - 'Meta': {'ordering': "['-date']", 'object_name': 'Log'}, - 'date': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'level': ('django.db.models.fields.IntegerField', [], {}), - 'message': ('django.db.models.fields.CharField', [], {'max_length': '255'}), - 'ref_object': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']", 'null': 'True', 'blank': 'True'}), - 'spider_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}) - }, - 'dynamic_scraper.schedulerruntime': { - 'Meta': {'object_name': 'SchedulerRuntime'}, - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'next_action_factor': ('django.db.models.fields.FloatField', [], {'null': 'True', 'blank': 'True'}), - 'next_action_time': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'num_zero_actions': ('django.db.models.fields.IntegerField', [], {'default': '0'}), - 'runtime_type': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scrapedobjattr': { - 'Meta': {'object_name': 'ScrapedObjAttr'}, - 'attr_type': ('django.db.models.fields.CharField', [], {'max_length': '1'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}) - }, - 'dynamic_scraper.scrapedobjclass': { - 'Meta': {'object_name': 'ScrapedObjClass'}, - 'checker_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 1440,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 1,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 5,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 15,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 10,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 20,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}) - }, - 'dynamic_scraper.scraper': { - 'Meta': {'object_name': 'Scraper'}, - 'checker_ref_url': ('django.db.models.fields.URLField', [], {'max_length': '200', 'blank': 'True'}), - 'checker_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'checker_x_path': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'checker_x_path_result': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'max_items_read': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'max_items_save': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'pagination_append_str': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'pagination_on_start': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'pagination_page_replace': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'pagination_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'scraped_obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}), - 'status': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scraperelem': { - 'Meta': {'object_name': 'ScraperElem'}, - 'from_detail_page': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'mandatory': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), - 'proc_ctxt': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'processors': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'reg_exp': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'scraped_obj_attr': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjAttr']"}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']"}), - 'x_path': ('django.db.models.fields.CharField', [], {'max_length': '200'}) - } - } - - complete_apps = ['dynamic_scraper'] diff --git a/dynamic_scraper/south_migrations/0002_auto__add_field_scraper_content_type.py b/dynamic_scraper/south_migrations/0002_auto__add_field_scraper_content_type.py deleted file mode 100644 index 0f2cb07a..00000000 --- a/dynamic_scraper/south_migrations/0002_auto__add_field_scraper_content_type.py +++ /dev/null @@ -1,88 +0,0 @@ -# encoding: utf-8 -import datetime -from south.db import db -from south.v2 import SchemaMigration -from django.db import models - -class Migration(SchemaMigration): - - def forwards(self, orm): - - # Adding field 'Scraper.content_type' - db.add_column('dynamic_scraper_scraper', 'content_type', self.gf('django.db.models.fields.CharField')(default='H', max_length=1), keep_default=False) - - - def backwards(self, orm): - - # Deleting field 'Scraper.content_type' - db.delete_column('dynamic_scraper_scraper', 'content_type') - - - models = { - 'dynamic_scraper.log': { - 'Meta': {'ordering': "['-date']", 'object_name': 'Log'}, - 'date': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'level': ('django.db.models.fields.IntegerField', [], {}), - 'message': ('django.db.models.fields.CharField', [], {'max_length': '255'}), - 'ref_object': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']", 'null': 'True', 'blank': 'True'}), - 'spider_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}) - }, - 'dynamic_scraper.schedulerruntime': { - 'Meta': {'object_name': 'SchedulerRuntime'}, - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'next_action_factor': ('django.db.models.fields.FloatField', [], {'null': 'True', 'blank': 'True'}), - 'next_action_time': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'num_zero_actions': ('django.db.models.fields.IntegerField', [], {'default': '0'}), - 'runtime_type': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scrapedobjattr': { - 'Meta': {'object_name': 'ScrapedObjAttr'}, - 'attr_type': ('django.db.models.fields.CharField', [], {'max_length': '1'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}) - }, - 'dynamic_scraper.scrapedobjclass': { - 'Meta': {'object_name': 'ScrapedObjClass'}, - 'checker_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 1440,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 1,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 5,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 15,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 10,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 20,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}) - }, - 'dynamic_scraper.scraper': { - 'Meta': {'object_name': 'Scraper'}, - 'checker_ref_url': ('django.db.models.fields.URLField', [], {'max_length': '200', 'blank': 'True'}), - 'checker_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'checker_x_path': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'checker_x_path_result': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'content_type': ('django.db.models.fields.CharField', [], {'default': "'H'", 'max_length': '1'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'max_items_read': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'max_items_save': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'pagination_append_str': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'pagination_on_start': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'pagination_page_replace': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'pagination_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'scraped_obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}), - 'status': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scraperelem': { - 'Meta': {'object_name': 'ScraperElem'}, - 'from_detail_page': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'mandatory': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), - 'proc_ctxt': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'processors': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'reg_exp': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'scraped_obj_attr': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjAttr']"}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']"}), - 'x_path': ('django.db.models.fields.CharField', [], {'max_length': '200'}) - } - } - - complete_apps = ['dynamic_scraper'] diff --git a/dynamic_scraper/south_migrations/0003_auto__add_logmarker__add_field_log_type.py b/dynamic_scraper/south_migrations/0003_auto__add_logmarker__add_field_log_type.py deleted file mode 100644 index ee437931..00000000 --- a/dynamic_scraper/south_migrations/0003_auto__add_logmarker__add_field_log_type.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -import datetime -from south.db import db -from south.v2 import SchemaMigration -from django.db import models - - -class Migration(SchemaMigration): - - def forwards(self, orm): - # Adding model 'LogMarker' - db.create_table('dynamic_scraper_logmarker', ( - ('id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), - ('message_contains', self.gf('django.db.models.fields.CharField')(max_length=255)), - ('ref_object', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('mark_with_type', self.gf('django.db.models.fields.CharField')(max_length=2)), - ('custom_type', self.gf('django.db.models.fields.CharField')(max_length=25, blank=True)), - ('spider_name', self.gf('django.db.models.fields.CharField')(max_length=200, blank=True)), - ('scraper', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['dynamic_scraper.Scraper'], null=True, blank=True)), - )) - db.send_create_signal('dynamic_scraper', ['LogMarker']) - - # Adding field 'Log.type' - db.add_column('dynamic_scraper_log', 'type', - self.gf('django.db.models.fields.CharField')(default='', max_length=25, blank=True), - keep_default=False) - - - def backwards(self, orm): - # Deleting model 'LogMarker' - db.delete_table('dynamic_scraper_logmarker') - - # Deleting field 'Log.type' - db.delete_column('dynamic_scraper_log', 'type') - - - models = { - 'dynamic_scraper.log': { - 'Meta': {'ordering': "['-date']", 'object_name': 'Log'}, - 'date': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'level': ('django.db.models.fields.IntegerField', [], {}), - 'message': ('django.db.models.fields.CharField', [], {'max_length': '255'}), - 'ref_object': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']", 'null': 'True', 'blank': 'True'}), - 'spider_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'type': ('django.db.models.fields.CharField', [], {'max_length': '25', 'blank': 'True'}) - }, - 'dynamic_scraper.logmarker': { - 'Meta': {'object_name': 'LogMarker'}, - 'custom_type': ('django.db.models.fields.CharField', [], {'max_length': '25', 'blank': 'True'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'mark_with_type': ('django.db.models.fields.CharField', [], {'max_length': '2'}), - 'message_contains': ('django.db.models.fields.CharField', [], {'max_length': '255'}), - 'ref_object': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']", 'null': 'True', 'blank': 'True'}), - 'spider_name': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}) - }, - 'dynamic_scraper.schedulerruntime': { - 'Meta': {'ordering': "['next_action_time']", 'object_name': 'SchedulerRuntime'}, - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'next_action_factor': ('django.db.models.fields.FloatField', [], {'null': 'True', 'blank': 'True'}), - 'next_action_time': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'num_zero_actions': ('django.db.models.fields.IntegerField', [], {'default': '0'}), - 'runtime_type': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scrapedobjattr': { - 'Meta': {'object_name': 'ScrapedObjAttr'}, - 'attr_type': ('django.db.models.fields.CharField', [], {'max_length': '1'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}) - }, - 'dynamic_scraper.scrapedobjclass': { - 'Meta': {'ordering': "['name']", 'object_name': 'ScrapedObjClass'}, - 'checker_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 1440,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 1,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 5,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 15,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 10,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 20,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}) - }, - 'dynamic_scraper.scraper': { - 'Meta': {'ordering': "['name', 'scraped_obj_class']", 'object_name': 'Scraper'}, - 'checker_ref_url': ('django.db.models.fields.URLField', [], {'max_length': '200', 'blank': 'True'}), - 'checker_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'checker_x_path': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'checker_x_path_result': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'content_type': ('django.db.models.fields.CharField', [], {'default': "'H'", 'max_length': '1'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'max_items_read': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'max_items_save': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'pagination_append_str': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'pagination_on_start': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'pagination_page_replace': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'pagination_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'scraped_obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}), - 'status': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scraperelem': { - 'Meta': {'object_name': 'ScraperElem'}, - 'from_detail_page': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'mandatory': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), - 'proc_ctxt': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'processors': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'reg_exp': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'scraped_obj_attr': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjAttr']"}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']"}), - 'x_path': ('django.db.models.fields.CharField', [], {'max_length': '200'}) - } - } - - complete_apps = ['dynamic_scraper'] \ No newline at end of file diff --git a/dynamic_scraper/south_migrations/0004_auto__chg_field_scraper_checker_ref_url.py b/dynamic_scraper/south_migrations/0004_auto__chg_field_scraper_checker_ref_url.py deleted file mode 100644 index a3c239f9..00000000 --- a/dynamic_scraper/south_migrations/0004_auto__chg_field_scraper_checker_ref_url.py +++ /dev/null @@ -1,98 +0,0 @@ -# -*- coding: utf-8 -*- -import datetime -from south.db import db -from south.v2 import SchemaMigration -from django.db import models - - -class Migration(SchemaMigration): - - def forwards(self, orm): - - # Changing field 'Scraper.checker_ref_url' - db.alter_column('dynamic_scraper_scraper', 'checker_ref_url', self.gf('django.db.models.fields.URLField')(max_length=500)) - - def backwards(self, orm): - - # Changing field 'Scraper.checker_ref_url' - db.alter_column('dynamic_scraper_scraper', 'checker_ref_url', self.gf('django.db.models.fields.URLField')(max_length=200)) - - models = { - 'dynamic_scraper.log': { - 'Meta': {'ordering': "['-date']", 'object_name': 'Log'}, - 'date': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'level': ('django.db.models.fields.IntegerField', [], {}), - 'message': ('django.db.models.fields.CharField', [], {'max_length': '255'}), - 'ref_object': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']", 'null': 'True', 'blank': 'True'}), - 'spider_name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'type': ('django.db.models.fields.CharField', [], {'max_length': '25', 'blank': 'True'}) - }, - 'dynamic_scraper.logmarker': { - 'Meta': {'object_name': 'LogMarker'}, - 'custom_type': ('django.db.models.fields.CharField', [], {'max_length': '25', 'blank': 'True'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'mark_with_type': ('django.db.models.fields.CharField', [], {'max_length': '2'}), - 'message_contains': ('django.db.models.fields.CharField', [], {'max_length': '255'}), - 'ref_object': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']", 'null': 'True', 'blank': 'True'}), - 'spider_name': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}) - }, - 'dynamic_scraper.schedulerruntime': { - 'Meta': {'ordering': "['next_action_time']", 'object_name': 'SchedulerRuntime'}, - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'next_action_factor': ('django.db.models.fields.FloatField', [], {'null': 'True', 'blank': 'True'}), - 'next_action_time': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), - 'num_zero_actions': ('django.db.models.fields.IntegerField', [], {'default': '0'}), - 'runtime_type': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scrapedobjattr': { - 'Meta': {'object_name': 'ScrapedObjAttr'}, - 'attr_type': ('django.db.models.fields.CharField', [], {'max_length': '1'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}) - }, - 'dynamic_scraper.scrapedobjclass': { - 'Meta': {'ordering': "['name']", 'object_name': 'ScrapedObjClass'}, - 'checker_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 1440,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 1,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 5,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'scraper_scheduler_conf': ('django.db.models.fields.TextField', [], {'default': '\'"MIN_TIME": 15,\\n"MAX_TIME": 10080,\\n"INITIAL_NEXT_ACTION_FACTOR": 10,\\n"ZERO_ACTIONS_FACTOR_CHANGE": 20,\\n"FACTOR_CHANGE_FACTOR": 1.3,\\n\''}) - }, - 'dynamic_scraper.scraper': { - 'Meta': {'ordering': "['name', 'scraped_obj_class']", 'object_name': 'Scraper'}, - 'checker_ref_url': ('django.db.models.fields.URLField', [], {'max_length': '500', 'blank': 'True'}), - 'checker_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'checker_x_path': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'checker_x_path_result': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'comments': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'content_type': ('django.db.models.fields.CharField', [], {'default': "'H'", 'max_length': '1'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'max_items_read': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'max_items_save': ('django.db.models.fields.IntegerField', [], {'null': 'True', 'blank': 'True'}), - 'name': ('django.db.models.fields.CharField', [], {'max_length': '200'}), - 'pagination_append_str': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'pagination_on_start': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'pagination_page_replace': ('django.db.models.fields.TextField', [], {'blank': 'True'}), - 'pagination_type': ('django.db.models.fields.CharField', [], {'default': "'N'", 'max_length': '1'}), - 'scraped_obj_class': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjClass']"}), - 'status': ('django.db.models.fields.CharField', [], {'default': "'P'", 'max_length': '1'}) - }, - 'dynamic_scraper.scraperelem': { - 'Meta': {'object_name': 'ScraperElem'}, - 'from_detail_page': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), - 'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), - 'mandatory': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), - 'proc_ctxt': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'processors': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'reg_exp': ('django.db.models.fields.CharField', [], {'max_length': '200', 'blank': 'True'}), - 'scraped_obj_attr': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.ScrapedObjAttr']"}), - 'scraper': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['dynamic_scraper.Scraper']"}), - 'x_path': ('django.db.models.fields.CharField', [], {'max_length': '200'}) - } - } - - complete_apps = ['dynamic_scraper'] \ No newline at end of file From 25542efb5a27f12f65e988036bda6b9af3898900 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 1 Jun 2019 09:59:58 +0300 Subject: [PATCH 04/11] change upgrade requirements --- requirements.txt | 22 +++++++++++----------- tests/scraper/models.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6d828595..b507833c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ -Django>=1.11,<1.12 -Scrapy>=1.5,<1.6 -scrapy-djangoitem>=1.1.1,<1.2 +Django>=2.0 +Scrapy>=1.6 +scrapy-djangoitem==1.1.1 scrapy-splash>=0.7,<0.8 -scrapyd>=1.2,<1.3 -jsonpath-rw>=1.4 -kombu>=3.0.37,<3.1 -Celery==3.1.25 -django-celery==3.2.1 -future>=0.17,<0.18 -pillow>=5.0,<6.0 -attrs>=17.4.0 +scrapyd==1.2 +jsonpath-rw==1.4.0 +kombu==3.0.23 +Celery==3.1.15 +django-celery +future==0.17.1 +pillow==6.0 +attrs==19.1.0 diff --git a/tests/scraper/models.py b/tests/scraper/models.py index 371151e3..931f8df5 100644 --- a/tests/scraper/models.py +++ b/tests/scraper/models.py @@ -21,7 +21,7 @@ def __str__(self): @python_2_unicode_compatible class Event(models.Model): title = models.CharField(max_length=200) - event_website = models.ForeignKey(EventWebsite) + event_website = models.ForeignKey(EventWebsite,on_delete=models.CASCADE) description = models.TextField(blank=True) description2 = models.TextField(blank=True) url = models.URLField(blank=True) From 40747e65065e5326c1fe4154fa83c8ba75bbc50d Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 1 Jun 2019 10:23:51 +0300 Subject: [PATCH 05/11] change upgrade requirements --- requirements.txt | 53 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/requirements.txt b/requirements.txt index b507833c..2f295720 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,44 @@ -Django>=2.0 -Scrapy>=1.6 -scrapy-djangoitem==1.1.1 -scrapy-splash>=0.7,<0.8 -scrapyd==1.2 +amqp==1.4.9 +anyjson==0.3.3 +asn1crypto==0.24.0 +attrs==19.1.0 +Automat==0.7.0 +billiard==3.3.0.23 +celery==3.1.15 +cffi==1.12.3 +constantly==15.1.0 +cryptography==2.7 +cssselect==1.0.3 +decorator==4.4.0 +Django==2.2.1 +django-celery==3.3.0 +future==0.17.1 +hyperlink==19.0.0 +idna==2.8 +incremental==17.5.0 jsonpath-rw==1.4.0 kombu==3.0.23 -Celery==3.1.15 -django-celery -future==0.17.1 -pillow==6.0 -attrs==19.1.0 - +lxml==4.3.3 +parsel==1.5.1 +Pillow==6.0.0 +pkg-resources==0.0.0 +ply==3.11 +pyasn1==0.4.5 +pyasn1-modules==0.2.5 +pycparser==2.19 +PyDispatcher==2.0.5 +PyHamcrest==1.9.0 +pyOpenSSL==19.0.0 +pytz==2019.1 +queuelib==1.5.0 +Scrapy==1.6.0 +scrapy-djangoitem==1.1.1 +scrapy-splash==0.7.2 +scrapyd==1.2.0 +service-identity==18.1.0 +six==1.12.0 +sqlparse==0.3.0 +Twisted==19.2.0 +vine==1.3.0 +w3lib==1.20.0 +zope.interface==4.6.0 From 4e84ceb7d8fc0b6c188126b769ec495dc7557959 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 1 Jun 2019 10:33:32 +0300 Subject: [PATCH 06/11] change upgrade requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2f295720..99443e6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ asn1crypto==0.24.0 attrs==19.1.0 Automat==0.7.0 billiard==3.3.0.23 -celery==3.1.15 +celery==3.1.25 cffi==1.12.3 constantly==15.1.0 cryptography==2.7 From 291f646c5fdbc8defa01ec334620edcdbddf1826 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 1 Jun 2019 11:19:32 +0300 Subject: [PATCH 07/11] change upgrade requirements --- requirements.txt | 54 ++++++++++-------------------------------------- 1 file changed, 11 insertions(+), 43 deletions(-) diff --git a/requirements.txt b/requirements.txt index 99443e6b..f262f516 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,44 +1,12 @@ -amqp==1.4.9 -anyjson==0.3.3 -asn1crypto==0.24.0 -attrs==19.1.0 -Automat==0.7.0 -billiard==3.3.0.23 -celery==3.1.25 -cffi==1.12.3 -constantly==15.1.0 -cryptography==2.7 -cssselect==1.0.3 -decorator==4.4.0 -Django==2.2.1 -django-celery==3.3.0 -future==0.17.1 -hyperlink==19.0.0 -idna==2.8 -incremental==17.5.0 -jsonpath-rw==1.4.0 -kombu==3.0.23 -lxml==4.3.3 -parsel==1.5.1 -Pillow==6.0.0 -pkg-resources==0.0.0 -ply==3.11 -pyasn1==0.4.5 -pyasn1-modules==0.2.5 -pycparser==2.19 -PyDispatcher==2.0.5 -PyHamcrest==1.9.0 -pyOpenSSL==19.0.0 -pytz==2019.1 -queuelib==1.5.0 -Scrapy==1.6.0 +Django>=2 +Scrapy>=1.5 scrapy-djangoitem==1.1.1 -scrapy-splash==0.7.2 -scrapyd==1.2.0 -service-identity==18.1.0 -six==1.12.0 -sqlparse==0.3.0 -Twisted==19.2.0 -vine==1.3.0 -w3lib==1.20.0 -zope.interface==4.6.0 +scrapy-splash==0.7 +scrapyd==1.6 +jsonpath-rw>=1.4 +kombu>=3.0.37,<3.1 +Celery==3.1.25 +django-celery==3.2.1 +future>=0.17,<0.18 +pillow>=5.0,<6.0 +attrs>=17.4.0 \ No newline at end of file From f6a438082688e562e0b618be947f41f06a8eda07 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 15 Jun 2019 12:32:46 +0300 Subject: [PATCH 08/11] change upgrade requirements --- requirements.txt | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index f262f516..a7aa3fb1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,10 @@ Django>=2 -Scrapy>=1.5 +Scrapy>=1.6 scrapy-djangoitem==1.1.1 scrapy-splash==0.7 -scrapyd==1.6 +scrapyd==1.2 jsonpath-rw>=1.4 -kombu>=3.0.37,<3.1 -Celery==3.1.25 -django-celery==3.2.1 -future>=0.17,<0.18 -pillow>=5.0,<6.0 +Celery==4.2.0 +future==0.17.1 +Pillow==5.4.1 attrs>=17.4.0 \ No newline at end of file From 88cc56c48f0c59cc5d4d6cb1466cd77c34d62c99 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 15 Jun 2019 12:36:43 +0300 Subject: [PATCH 09/11] specify all versions --- requirements.txt | 84 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index a7aa3fb1..5c9c27fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,78 @@ -Django>=2 -Scrapy>=1.6 -scrapy-djangoitem==1.1.1 -scrapy-splash==0.7 -scrapyd==1.2 -jsonpath-rw>=1.4 -Celery==4.2.0 +amqp==2.5.0 +apt-xapian-index==0.47 +asn1crypto==0.24.0 +attrs==19.1.0 +Automat==0.7.0 +billiard==3.5.0.5 +celery==4.2.0 +certifi==2018.8.24 +cffi==1.12.3 +chardet==3.0.4 +command-not-found==0.3 +constantly==15.1.0 +cryptography==2.7 +cssselect==1.0.3 +cupshelpers==1.0 +decorator==4.4.0 +distro-info==0.18 +Django==2.2.2 +entrypoints==0.2.3.post3 future==0.17.1 +galternatives==1.0.1 +httplib2==0.11.3 +hyperlink==19.0.0 +idna==2.8 +incremental==17.5.0 +jsonpath-rw==1.4.0 +keyring==15.1.0 +keyrings.alt==3.1 +kombu==4.6.3 +language-selector==0.1 +lxml==4.3.4 +netifaces==0.10.4 +olefile==0.46 +parsel==1.5.1 +pexpect==4.6.0 Pillow==5.4.1 -attrs>=17.4.0 \ No newline at end of file +ply==3.11 +pyasn1==0.4.5 +pyasn1-modules==0.2.5 +pycairo==1.16.2 +pycparser==2.19 +pycrypto==2.6.1 +pycups==1.9.73 +PyDispatcher==2.0.5 +PyGObject==3.30.1 +PyHamcrest==1.9.0 +pyOpenSSL==19.0.0 +python-apt==1.7.0 +python-debian==0.1.33 +pytz==2019.1 +pyxdg==0.25 +PyYAML==3.12 +queuelib==1.5.0 +reportlab==3.5.6 +requests==2.18.4 +requests-unixsocket==0.1.5 +Scrapy==1.6.0 +scrapy-djangoitem==1.1.1 +scrapy-splash==0.7 +scrapyd==1.2.0 +SecretStorage==2.3.1 +service-identity==18.1.0 +six==1.12.0 +sqlparse==0.3.0 +ssh-import-id==5.7 +systemd-python==234 +Twisted==19.2.1 +ubuntu-drivers-common==0.0.0 +ufw==0.35 +unattended-upgrades==0.1 +urllib3==1.22 +usb-creator==0.3.3 +vboxapi==1.0 +vine==1.3.0 +virtualenv==15.1.0 +w3lib==1.20.0 +xkit==0.0.0 +zope.interface==4.6.0 From 72a9013fcd896a18a782b42a44b0e61f56487b99 Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 15 Jun 2019 14:16:35 +0300 Subject: [PATCH 10/11] specify all versions --- requirements.txt | 84 +++++------------------------------------------- 1 file changed, 8 insertions(+), 76 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5c9c27fa..a7aa3fb1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,78 +1,10 @@ -amqp==2.5.0 -apt-xapian-index==0.47 -asn1crypto==0.24.0 -attrs==19.1.0 -Automat==0.7.0 -billiard==3.5.0.5 -celery==4.2.0 -certifi==2018.8.24 -cffi==1.12.3 -chardet==3.0.4 -command-not-found==0.3 -constantly==15.1.0 -cryptography==2.7 -cssselect==1.0.3 -cupshelpers==1.0 -decorator==4.4.0 -distro-info==0.18 -Django==2.2.2 -entrypoints==0.2.3.post3 -future==0.17.1 -galternatives==1.0.1 -httplib2==0.11.3 -hyperlink==19.0.0 -idna==2.8 -incremental==17.5.0 -jsonpath-rw==1.4.0 -keyring==15.1.0 -keyrings.alt==3.1 -kombu==4.6.3 -language-selector==0.1 -lxml==4.3.4 -netifaces==0.10.4 -olefile==0.46 -parsel==1.5.1 -pexpect==4.6.0 -Pillow==5.4.1 -ply==3.11 -pyasn1==0.4.5 -pyasn1-modules==0.2.5 -pycairo==1.16.2 -pycparser==2.19 -pycrypto==2.6.1 -pycups==1.9.73 -PyDispatcher==2.0.5 -PyGObject==3.30.1 -PyHamcrest==1.9.0 -pyOpenSSL==19.0.0 -python-apt==1.7.0 -python-debian==0.1.33 -pytz==2019.1 -pyxdg==0.25 -PyYAML==3.12 -queuelib==1.5.0 -reportlab==3.5.6 -requests==2.18.4 -requests-unixsocket==0.1.5 -Scrapy==1.6.0 +Django>=2 +Scrapy>=1.6 scrapy-djangoitem==1.1.1 scrapy-splash==0.7 -scrapyd==1.2.0 -SecretStorage==2.3.1 -service-identity==18.1.0 -six==1.12.0 -sqlparse==0.3.0 -ssh-import-id==5.7 -systemd-python==234 -Twisted==19.2.1 -ubuntu-drivers-common==0.0.0 -ufw==0.35 -unattended-upgrades==0.1 -urllib3==1.22 -usb-creator==0.3.3 -vboxapi==1.0 -vine==1.3.0 -virtualenv==15.1.0 -w3lib==1.20.0 -xkit==0.0.0 -zope.interface==4.6.0 +scrapyd==1.2 +jsonpath-rw>=1.4 +Celery==4.2.0 +future==0.17.1 +Pillow==5.4.1 +attrs>=17.4.0 \ No newline at end of file From c07f0e957df29b1d6a9437ba5fe69cf7245b99df Mon Sep 17 00:00:00 2001 From: banana Date: Sat, 15 Jun 2019 15:25:50 +0300 Subject: [PATCH 11/11] add table space --- dynamic_scraper/migrations/0001_initial.py | 7 ++++++- dynamic_scraper/models.py | 17 ++++++----------- setup.py | 2 +- tests/settings/base_settings.py | 1 + 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/dynamic_scraper/migrations/0001_initial.py b/dynamic_scraper/migrations/0001_initial.py index fb87eec1..160cdc14 100644 --- a/dynamic_scraper/migrations/0001_initial.py +++ b/dynamic_scraper/migrations/0001_initial.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Generated by Django 1.11.20 on 2019-05-25 06:05 +# Generated by Django 1.11.20 on 2019-06-15 07:23 from __future__ import unicode_literals import datetime @@ -39,6 +39,7 @@ class Migration(migrations.Migration): ], options={ 'ordering': ['-date'], + 'db_tablespace': 'tables', }, ), migrations.CreateModel( @@ -81,6 +82,7 @@ class Migration(migrations.Migration): ], options={ 'ordering': ['next_action_time'], + 'db_tablespace': 'tables', }, ), migrations.CreateModel( @@ -95,6 +97,7 @@ class Migration(migrations.Migration): ], options={ 'ordering': ['order'], + 'db_tablespace': 'tables', }, ), migrations.CreateModel( @@ -110,6 +113,7 @@ class Migration(migrations.Migration): 'verbose_name': 'Scraped object class', 'verbose_name_plural': 'Scraped object classes', 'ordering': ['name'], + 'db_tablespace': 'tables', }, ), migrations.CreateModel( @@ -140,6 +144,7 @@ class Migration(migrations.Migration): ], options={ 'ordering': ['name', 'scraped_obj_class'], + 'db_tablespace': 'tables', }, ), migrations.CreateModel( diff --git a/dynamic_scraper/models.py b/dynamic_scraper/models.py index 2bbfb4be..1e9805a2 100644 --- a/dynamic_scraper/models.py +++ b/dynamic_scraper/models.py @@ -1,6 +1,4 @@ -#Stage 2 Update (Python 3) from __future__ import unicode_literals -from django.utils.encoding import python_2_unicode_compatible from builtins import range from builtins import str from builtins import object @@ -9,7 +7,6 @@ from django.db.models import Q -@python_2_unicode_compatible class ScrapedObjClass(models.Model): name = models.CharField(max_length=200) scraper_scheduler_conf = models.TextField(default='\ @@ -30,12 +27,12 @@ def __str__(self): return self.name class Meta(object): + db_tablespace = "tables" verbose_name = "Scraped object class" verbose_name_plural = "Scraped object classes" ordering = ['name',] -@python_2_unicode_compatible class ScrapedObjAttr(models.Model): ATTR_TYPE_CHOICES = ( ('S', 'STANDARD'), @@ -55,10 +52,10 @@ def __str__(self): return self.name + " (" + str(self.obj_class) + ")" class Meta(object): + db_tablespace = "tables" ordering = ['order',] -@python_2_unicode_compatible class Scraper(models.Model): STATUS_CHOICES = ( ('A', 'ACTIVE'), @@ -167,7 +164,7 @@ def get_follow_page_rpts(self): return self.requestpagetype_set.filter(page_type='FP') def get_detail_page_rpts(self): - return s.requestpagetype_set.filter(~Q(page_type='MP')) + return self.requestpagetype_set.filter(~Q(page_type='MP')) def get_rpt(self, page_type): return self.requestpagetype_set.get(page_type=page_type) @@ -232,10 +229,10 @@ def __str__(self): return self.name + " (" + self.scraped_obj_class.name + ")" class Meta(object): + db_tablespace = "tables" ordering = ['name', 'scraped_obj_class',] -@python_2_unicode_compatible class RequestPageType(models.Model): TYPE_CHOICES = tuple([("MP", "Main Page"), ("FP", "Follow Page"),] + [("DP{n}".format(n=str(n)), "Detail Page {n}".format(n=str(n))) for n in list(range(1, 26))]) CONTENT_TYPE_CHOICES = ( @@ -274,7 +271,6 @@ def __str__(self): return ret_str -@python_2_unicode_compatible class Checker(models.Model): CHECKER_TYPE = ( ('4', '404'), @@ -293,7 +289,6 @@ def __str__(self): return str(self.scraped_obj_attr) + ' > ' + self.get_checker_type_display() -@python_2_unicode_compatible class ScraperElem(models.Model): REQUEST_PAGE_TYPE_CHOICES = tuple([("MP", "Main Page")] + [("DP{n}".format(n=str(n)), "Detail Page {n}".format(n=str(n))) for n in list(range(1, 26))]) help_text = "The different attributes to be scraped, exactly one attribute of type BASE necessary." @@ -322,8 +317,6 @@ class Meta(object): ordering = ['scraped_obj_attr__order',] - -@python_2_unicode_compatible class SchedulerRuntime(models.Model): TYPE = ( ('S', 'SCRAPER'), @@ -338,6 +331,7 @@ def __str__(self): return str(self.id) class Meta(object): + db_tablespace = "tables" ordering = ['next_action_time',] @@ -385,4 +379,5 @@ def numeric_level(level): return numeric_level class Meta(object): + db_tablespace = "tables" ordering = ['-date'] diff --git a/setup.py b/setup.py index 51dd86a1..9d477022 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='django-dynamic-scraper', - version='0.13.2', + version='0.13.3', description='Creating Scrapy scrapers via the Django admin interface', author='Holger Drewes', author_email='Holger.Drewes@gmail.com', diff --git a/tests/settings/base_settings.py b/tests/settings/base_settings.py index a3797b0e..c11d6c03 100644 --- a/tests/settings/base_settings.py +++ b/tests/settings/base_settings.py @@ -2,6 +2,7 @@ # Scrapy settings for unit tests import os, sys + PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) sys.path.insert(0, os.path.join(PROJECT_ROOT, "../.."))