#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import json
import re
from datetime import datetime

import requests


def get_ad_archive_id(data):
    """
    Extract ad_archive_id from ad_snapshot_url
    """
    return re.search(r"/\?id=([0-9]+)", data["ad_snapshot_url"]).group(1)
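

# Illustrative example only (the id below is made up): assuming an
# ad_snapshot_url of the usual render_ad form, e.g.
#   https://www.facebook.com/ads/archive/render_ad/?id=1234567890&access_token=...
# get_ad_archive_id would return the string "1234567890".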


class FbAdsLibraryTraversal:
    default_url_pattern = (
        "https://graph.facebook.com/{}/ads_archive?access_token={}&"
        + "fields={}&search_terms={}&ad_reached_countries={}"
    )
    default_api_version = "v21.0"

    def __init__(
        self,
        access_token,
        fields,
        search_term,
        country,
        search_page_ids="",
        ad_active_status="ALL",
        before_date=None,
        after_date="2000-01-01",
        ad_type="ALL",
        bylines=None,
        delivery_by_region=None,
        estimated_audience_size_max=None,
        estimated_audience_size_min=None,
        languages=None,
        media_type="ALL",
        publisher_platforms=None,
        search_type="KEYWORD_UNORDERED",
        unmask_removed_content=False,
        page_limit=500,
        api_version=None,
        retry_limit=3,
    ):
        self.page_count = 0
        self.access_token = access_token
        self.fields = fields
        # ad_delivery_start_time is required for the after_date filtering
        # in _get_ad_archives_from_url, so make sure it is always requested.
        if "ad_delivery_start_time" not in self.fields:
            self.fields += ",ad_delivery_start_time"
        self.search_term = search_term
        self.country = country
        self.search_page_ids = search_page_ids
        self.ad_active_status = ad_active_status
        self.before_date = before_date
        self.after_date = after_date
        self.ad_type = ad_type
        self.bylines = bylines
        self.delivery_by_region = delivery_by_region
        self.estimated_audience_size_max = estimated_audience_size_max
        self.estimated_audience_size_min = estimated_audience_size_min
        self.languages = languages
        self.media_type = media_type
        self.publisher_platforms = publisher_platforms
        self.search_type = search_type
        self.unmask_removed_content = unmask_removed_content
        self.page_limit = page_limit
        self.retry_limit = retry_limit
        if api_version is None:
            self.api_version = self.default_api_version
        else:
            self.api_version = api_version

    def generate_ad_archives(self):
        base_url = self.default_url_pattern.format(
            self.api_version,
            self.access_token,
            self.fields,
            self.search_term,
            self.country,
        )
        # Optional query parameters; any that are unset (or left at a falsy
        # default) are omitted from the request URL.
        optional_params = {
            "search_page_ids": self.search_page_ids,
            "ad_active_status": self.ad_active_status,
            "ad_delivery_date_max": self.before_date,
            "ad_delivery_date_min": (
                self.after_date if self.after_date != "2000-01-01" else None
            ),
            "ad_type": self.ad_type,
            "bylines": self.bylines,
            "delivery_by_region": self.delivery_by_region,
            "estimated_audience_size_max": self.estimated_audience_size_max,
            "estimated_audience_size_min": self.estimated_audience_size_min,
            "languages": self.languages,
            "media_type": self.media_type,
            "publisher_platforms": self.publisher_platforms,
            "search_type": self.search_type,
            "unmask_removed_content": self.unmask_removed_content,
            "limit": self.page_limit,
        }
        optional_params_str = "&".join(
            f"{key}={value}" for key, value in optional_params.items() if value
        )
        next_page_url = f"{base_url}&{optional_params_str}"
        return self.__class__._get_ad_archives_from_url(
            next_page_url, after_date=self.after_date, retry_limit=self.retry_limit
        )

    @staticmethod
    def _get_ad_archives_from_url(
        next_page_url, after_date="2000-01-01", retry_limit=3
    ):
        last_error_url = None
        last_retry_count = 0
        print("after_date: ", after_date)
        print("next_page_url: ", next_page_url)
        start_time_cutoff_after = datetime.strptime(after_date, "%Y-%m-%d")
        print("start_time_cutoff_after: ", start_time_cutoff_after)
        start_time_cutoff_after = start_time_cutoff_after.timestamp()
        while next_page_url is not None:
            response = requests.get(next_page_url)
            response_data = json.loads(response.text)
            print("response_data: ", response_data)
            if "error" in response_data:
                if next_page_url == last_error_url:
                    # Same URL failed again; give up once the retry budget is spent.
                    if last_retry_count >= retry_limit:
                        raise Exception(
                            "Error message: [{}], failed on URL: [{}]".format(
                                json.dumps(response_data["error"]), next_page_url
                            )
                        )
                else:
                    # First failure on this URL; start a fresh retry count.
                    last_error_url = next_page_url
                    last_retry_count = 0
                last_retry_count += 1
                continue
            # Keep only ads whose delivery started on or after after_date.
            filtered = list(
                filter(
                    lambda ad_archive: ("ad_delivery_start_time" in ad_archive)
                    and (
                        datetime.strptime(
                            ad_archive["ad_delivery_start_time"], "%Y-%m-%d"
                        ).timestamp()
                        >= start_time_cutoff_after
                    ),
                    response_data["data"],
                )
            )
            if len(filtered) == 0:
                # No ads newer than after_date on this page; stop paginating.
                next_page_url = None
                break
            yield filtered
            if "paging" in response_data:
                # The last page may carry a "paging" block without a "next" link.
                next_page_url = response_data["paging"].get("next")
            else:
                next_page_url = None

    @classmethod
    def generate_ad_archives_from_url(cls, failure_url, after_date="2000-01-01"):
        """
        If a previous run failed with an error, resume the traversal from the
        last failure URL instead of starting over.
        """
        return cls._get_ad_archives_from_url(failure_url, after_date=after_date)
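

# A minimal usage sketch, not part of the original module: it assumes you hold
# a valid Ad Library API access token and shows how the traversal generator is
# typically consumed. The token, search term, country, and field list below
# are placeholders, not values taken from this repository.
if __name__ == "__main__":
    api = FbAdsLibraryTraversal(
        access_token="YOUR_ACCESS_TOKEN",  # placeholder; supply a real token
        fields="id,ad_snapshot_url,ad_delivery_start_time",
        search_term="climate",
        country="US",
        after_date="2024-01-01",
    )
    # generate_ad_archives() yields one filtered page (a list of ad dicts) at a time.
    for page in api.generate_ad_archives():
        for ad in page:
            print(get_ad_archive_id(ad), ad.get("ad_delivery_start_time"))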