forked from amccollum/microtron
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck.py
executable file
·48 lines (37 loc) · 1.36 KB
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#! /usr/bin/env python
# -*- coding: utf-8 -*-
""" util to check microformat data (not quite validation ;-) """
import lxml.etree, lxml.html
from optparse import OptionParser
import os
import pprint
import sys
from microtron import *
def parse(argv=None):
    """Check one microformat in a document and print the errors found.

    Expects exactly two positional command-line arguments, ``<url>`` and
    ``<format>``.  The document is loaded with ``lxml.html.parse``, handed to
    a ``Parser`` created with ``strict=True, collect_errors=True``, and every
    entry of the parser's ``errors`` list is printed in ascending
    ``sourceline`` order, preceded by a count line.

    Args:
        argv: Full argument vector, program name at index 0.  Defaults to
            ``sys.argv`` when None.

    Returns:
        None (there is no explicit return value).

    Raises:
        SystemExit: raised by optparse with status 2 when the number of
            positional arguments is not exactly two.
    """
    if argv is None:
        argv = sys.argv

    option_parser = OptionParser('usage: %prog <url> <format>')
    option_parser.add_option("-s", "--strict",
        action="store_true", dest="strict", default=False,
        help="be strict about parsing")
    # NOTE(review): the parsed options (including --strict) are never
    # consulted; the Parser below is always built with strict=True.  Left
    # as-is so the tool's current output does not change -- confirm intent.
    _options, arguments = option_parser.parse_args(argv[1:])
    if len(arguments) != 2:
        option_parser.error('Incorrect number of arguments')

    url, format_name = arguments

    tree = lxml.html.parse(url)
    mf_parser = Parser(tree, strict=True, collect_errors=True)
    # The parsed data itself is not used here; only the errors collected on
    # the parser object are reported.
    mf_parser.parse_format(format_name)

    # key= sorting and single-argument print(...) behave identically on
    # Python 2 and also run on Python 3 (cmp-style sort and the print
    # statement do not).
    errs = sorted(mf_parser.errors, key=lambda err: err.sourceline)
    print("%d errors:" % len(errs))
    for err in errs:
        print("ERROR (line %d): %s" % (err.sourceline, err))

    # TODO: extra checks for hnews:
    # - warn if dates insane (future, or distant past)
    # - updated but no published
    # - concatenated authors in single vcard ("Bob Smith and Fred Bloggs")
    # - insanity in content (eg adverts, scripts....)
if __name__ == '__main__':
    # Script entry point: run the checker and hand whatever parse() returns
    # to sys.exit() as the process exit status.
    exit_status = parse()
    sys.exit(exit_status)