This repository has been archived by the owner on Nov 25, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfbstream.d
347 lines (317 loc) · 8.16 KB
/
fbstream.d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
/**
* Fb2RSS is a translator from the HTML structure generated by Facebook to
* an atom feed.
*
* The page is formatted like this:
* $(UL
* $(LI The relevant data is inside `<code></code>` blocks)
* $(LI Inside these blocks is further HTML-Data, which is commented out.)
* $(LI The posting and its metadata are inside a `<div></div>`, which has the date-time attribute set.)
* $(LI The actual text of the post is inside another `<div></div>`, with class="_5pbx userContent")
* $(LI The link to the Post is inside the href of `<a></a>` with class="_5pcq")
* )
*
* Authors: Dominik Schmidt, [email protected]
*
* License:
* Copyright (C) 2015 Dominik Schmidt <[email protected]>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
module fbstream;
import std.net.curl;
import std.stdio;
import std.string;
import std.datetime : SysTime, unixTimeToStdTime;
import std.range;
import std.file;
import std.utf;
import drss.rss;
import drss.render;
import kxml.xml;
import std.typecons;
import std.conv;
import std.json;
/**
 * Builds the location of the cookie jar file.
 *
 * Returns: The file name "Fb2RSS_cookiejar.txt" joined onto the path that
 * `writablePath(StandardPath.config)` reports.
 */
string getCookiePath(){
    import std.path;
    import standardpaths;
    return buildPath(writablePath(StandardPath.config), "Fb2RSS_cookiejar.txt");
}
/**
 * Raised when a fetched page still shows the captcha challenge.
 *
 * Its string form is the bare message, without the file/line prefix and
 * stack trace that `Throwable.toString` would normally produce.
 */
class CaptchaException : Exception{
    /**
     * Params:
     * msg = The text describing the problem
     * file = Source file of the throw site
     * line = Source line of the throw site
     * next = An optional chained throwable
     */
    this(string msg, string file=__FILE__, size_t line=__LINE__, Throwable next=null){
        super(msg, file, line, next);
    }

    /// Returns: Only the message text.
    override string toString(){
        return this.msg;
    }
}
/**
 * Recursively looks for the first value stored under the key `id`.
 *
 * An object is checked for the key itself before its member values are
 * searched; arrays are searched element by element. Scalars never match.
 *
 * Params:
 * tree = The JSON value to search in
 * id = The object key to look for
 * Returns: The value found under `id`, or `JSONValue.init` (type `null_`)
 * when the key does not occur. A nested match whose value is JSON null is
 * indistinguishable from "not found" and the search continues past it.
 */
JSONValue search(JSONValue tree, string id){
    switch(tree.type){
        case JSONType.object:
            auto members = tree.object;
            // `in` yields a pointer to the value, so one lookup is enough.
            if(auto hit = id in members){
                return *hit;
            }
            foreach(child; members.byValue){
                auto found = search(child, id);
                if(found.type != JSONType.null_){
                    return found;
                }
            }
            break;
        case JSONType.array:
            foreach(child; tree.array){
                auto found = search(child, id);
                if(found.type != JSONType.null_){
                    return found;
                }
            }
            break;
        default:
            // Scalars cannot contain the key; fall through to the common return.
            break;
    }
    return JSONValue.init;
}
/**
 * Manages all the relevant tasks of
 * $(UL
 * $(LI Fetching the page, over HTTP or from a local file)
 * $(LI Parsing the markup into `Post` entries)
 * $(LI Formatting and outputting the resulting feed)
 * )
 */
class FBStream : DRSS!(Post){
    ///Holds the url, where we get the data from. Can either be an URL or a filename.
    private string fetch_url;
    ///The plaintext string holding the whole file
    char[] document;
    ///Feed headers handed to the renderer; `parse` fills in the url and title values.
    DRSS_Header[] headers=[Tuple!(string,string)("url",null),Tuple!(string,string)("title",null)];
    /**
     * The useragent to use for requesting the page with facebook.
     * Facebook does check this, and if it doesn't know it, it displays an
     * "Update your Browser"-Message
     */
    static string userAgent="curl/7.72.0";
    ///The XML declaration that `writeRSS` emits before the feed document.
    static string rss_header=`<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`;
    ///The source given to the constructor; `fetch` treats it as a file name when such a file exists.
    immutable string url;

    /**
     * Params: fetch_url = Fetch the Data from this source
     */
    this(string fetch_url){
        auto h=HTTP();
        h.url=fetch_url;
        h.setUserAgent(userAgent);
        date_reliability=DateReliable.YES;
        url=fetch_url;
        //h.setCookieJar(getCookiePath());
        super(h);
        this.fetch_url=fetch_url;
        // An array-literal field initializer is one array shared by every
        // instance; copy it so parse() only ever writes to this object's headers.
        headers=headers.dup;
    }

    /**
     * Returns whether the page in buf is already unlocked.
     *
     * Params:
     * buf = The chararray of the page.
     * Returns: True if the page is unlocked, false otherwise
     */
    static bool captchaSolved(in char[] buf){
        import std.algorithm.searching : canFind;
        return !canFind(buf, "tfbimage.php?captcha_challenge_code");
    }

    /**
     * Parses the document and appends every post found in it.
     *
     * Params:
     * document = The documentstring to parse.
     * Throws: `CaptchaException` if the page still contains the captcha
     * challenge; `Exception` if the title or the post container is missing.
     */
    override public void parse(string document){
        import std.exception : enforce;
        // This check needs only the raw text, so it runs before parsing and a
        // locked page is always reported as a captcha problem.
        if(!captchaSolved(document)){
            throw new CaptchaException("Captcha has not been solved yet. "
                ~"Please run the ./captcha utility");
        }
        XmlNode root=readDocument(document);
        XmlNode[] titles=root.parseXPath(`//title`);
        enforce(titles.length>0, "No title node found!");
        headers[1][1]=titles[0].getCData().idup;
        headers[0][1]=url;
        XmlNode[] level=root.parseXPath(`//div[@id="recent"]`);
        // The page is external input, so its shape is validated in every
        // build mode rather than with assert.
        enforce(level.length>0, "No data nodes found!");
        // The posts are the children of the first grandchild of the container.
        level=level[0].getChildren();
        enforce(level.length>0, "No data nodes found!");
        level=level[0].getChildren();
        enforce(level.length>0, "No data nodes found!");
        XmlNode[] posts=level[0].getChildren();
        // Posts are appended in reverse document order.
        foreach(node; posts.retro){
            appendPost(node);
        }
    }

    /**
     * Gets the information from the data-div and appends it to #posts
     *
     * Nodes without a styled content `<div>` are skipped silently.
     *
     * Params: match = The data-div node
     * Throws: `Exception` if the node has content but no usable link.
     */
    private void appendPost(XmlNode match){
        import std.exception : enforce;
        XmlNode[] styled;
        try{
            styled=match.parseXPath(`//div[@style]`);
        }
        catch(Exception e){
            return;
        }
        // The empty case is tested explicitly: indexing an empty array raises
        // a RangeError, which is an Error and is not caught by catch(Exception).
        if(styled.length==0){
            return;
        }
        XmlNode usercontent=styled[0];
        SysTime t=getPostTimestamp(match);
        XmlNode[] anchors=match.parseXPath(`//a`);
        string hrefs;
        if(anchors.length!=0){
            // The last anchor of the node is taken as the link to the post.
            hrefs=anchors[$-1].getAttribute("href");
            /*
            import std.regex;
            auto re = ctRegex!"[^?]+";
            auto m = hrefs.matchFirst(re);
            if(m){
            hrefs = m[0];
            }
            */
        }
        enforce(hrefs.length>0, "Post without a link found!");
        addEntry(Post(usercontent,t,hrefs));
    }

    /**
     * Gets the timestamp of a post
     *
     * Reads the JSON in the node's `data-ft` attribute and converts the
     * integer found under "publish_time" from unix time.
     *
     * Params: post = The data-div node
     * Returns: The publication time of the post
     */
    private SysTime getPostTimestamp(XmlNode post){
        import std.json;
        auto meta=parseJSON(post.getAttribute("data-ft"));
        auto published=search(meta, "publish_time");
        // `integer` throws a JSONException if no integer was found.
        return SysTime(unixTimeToStdTime(published.integer));
    }

    /**
     * Fetches the raw-data, either from File or from URL
     *
     * Returns: true when a local file was read, otherwise whatever the base
     * class fetch reports.
     */
    public override bool fetch(){
        if(exists(url) && isFile(url)){
            buffer=cast(ubyte[])read(url);
            return true;
        }
        return super.fetch();
    }

    /**
     * Generates the RSS-file
     *
     * Params:
     * f = the file to write the RSS-Document to.
     */
    void writeRSS(File f){
        import drss.render;
        XmlNode feed=generateRSS(this,headers);
        f.writeln(rss_header);
        f.writeln(feed);
    }
}
/**
 * A single post: its content node, its timestamp and the link to it.
 *
 * Posts are ordered and compared by their timestamp only.
 */
struct Post{
    ///The userdata `<div></div>`
    XmlNode content;
    ///The modification date
    SysTime time;
    ///The Post-href
    string href;
    ///The count of characters, until the title gets cut off.
    static ushort title_cutoff=80;

    /**
     * Collects the character data of a node and all of its descendants.
     *
     * Params: n = The node to start at
     * Returns: The concatenated text, in depth-first order
     */
    static string plaintext(XmlNode n){
        Appender!string app = Appender!string();
        plaintext(n, app);
        return app.data;
    }

    /**
     * Recursive worker for the single-argument overload.
     *
     * Params:
     * n = The node whose text is appended, followed by that of its children
     * app = The appender receiving the text
     */
    static void plaintext(XmlNode n, ref Appender!string app){
        app~=n.getCData();
        foreach(child; n.getChildren){
            plaintext(child, app);
        }
    }

    /**
     * Returns: The title of the posting: the plain text of the content,
     * shortened to at most `title_cutoff` code points. "..." is appended only
     * when text was actually cut off.
     */
    @property string title(){
        // Local import: std.range also declares a `stride`.
        import std.utf : stride;
        string cont=plaintext(content);
        // Walk code point by code point and stop at the end of the text, so
        // multibyte text shorter than the cutoff is never indexed past its end.
        size_t offset=0;
        for(size_t seen=0; seen<title_cutoff && offset<cont.length; ++seen){
            offset+=stride(cont, offset);
        }
        if(offset<cont.length){
            cont=cont[0..offset]~"...";
        }
        return cont;
    }

    ///Returns: The link to the post.
    @property string link() const{
        return "https://facebook.com"~href;
    }

    /**
     * Returns: An unique id to the post
     * Bugs: It should be something sensible here, not just the link.
     * Optimally, it should be the same as the facebookfeed read.
     */
    @property string id() const{
        return link();
    }

    /// Returns: The Atom-valid datestring
    @property string ISOTime() const{
        return time.toISOExtString();
    }

    /// Returns: An UCData-Object holding the string form of the content node.
    @property UCData getUCContent(){
        UCData uc=new UCData();
        uc.setCData(content.toString());
        return uc;
    }

    /**
     * Compares the object with b by comparing the dates
     * Returns: -1 if b is bigger, 1 if b is smaller, 0 if they're equal
     */
    int opCmp(in ref Post b) const{
        if(time<b.time){
            return -1;
        }
        return time>b.time ? 1 : 0;
    }

    /**
     * Generates an Atom-Entry matching the post
     * Returns: The Entry-Node for inclusion inside the Atom-Feed.
     */
    XmlNode toXML(){
        XmlNode entry=new XmlNode("entry");
        entry.addChild(new XmlNode("title").addCData(title));
        entry.addChild(new XmlNode("link").setAttribute("href",link));
        entry.addChild(new XmlNode("id").addCData(id));
        entry.addChild(new XmlNode("published").addCData(ISOTime()));
        entry.addChild(new XmlNode("content").setAttribute("type","html").addChild(getUCContent()));
        return entry;
    }

    /// Returns: true if both posts carry the same timestamp.
    bool opEquals(in ref Post b) const{
        return (opCmp(b)==0);
    }

    /// Returns: true if both posts carry the same timestamp.
    bool opEquals(in Post b) const{
        return (opCmp(b)==0);
    }
}