Skip to content
This repository was archived by the owner on Oct 31, 2019. It is now read-only.

Commit 263a0e7

Browse files
committed
initial commit
First commit for initial xquery project.
0 parents  commit 263a0e7

8 files changed

+664
-0
lines changed

.gitignore

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# vscode
2+
.vscode
3+
debug
4+
*.test
5+
6+
./build
7+
8+
# Compiled Object files, Static and Dynamic libs (Shared Objects)
9+
*.o
10+
*.a
11+
*.so
12+
13+
14+
# Folders
15+
_obj
16+
_test
17+
18+
# Architecture specific extensions/prefixes
19+
*.[568vq]
20+
[568vq].out
21+
22+
*.cgo1.go
23+
*.cgo2.c
24+
_cgo_defun.c
25+
_cgo_gotypes.go
26+
_cgo_export.*
27+
28+
_testmain.go
29+
30+
*.exe
31+
*.test
32+
*.prof

LICENSE

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
Permission is hereby granted, free of charge, to any person obtaining a copy
2+
of this software and associated documentation files (the "Software"), to deal
3+
in the Software without restriction, including without limitation the rights
4+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
5+
copies of the Software, and to permit persons to whom the Software is
6+
furnished to do so, subject to the following conditions:
7+
8+
The above copyright notice and this permission notice shall be included in
9+
all copies or substantial portions of the Software.
10+
11+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17+
THE SOFTWARE.

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
XQuery
2+
====
3+
XQuery is a package to extract data from HTML and XML using XPath selectors.
4+
5+
Installing
6+
====

html/select.go

+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
package html
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
7+
"github.com/antchfx/gxpath"
8+
"github.com/antchfx/gxpath/xpath"
9+
"golang.org/x/net/html"
10+
)
11+
12+
// Selector is an XPath selector for HTML documents. It carries no state, so
// the zero value is ready to use.
type Selector struct{}
14+
15+
func (s *Selector) Find(top *html.Node, expr string) []*html.Node {
16+
nav := &htmlNodeNavigator{curr: top, root: top, attr: -1}
17+
var elems []*html.Node
18+
t := s.Select(nav, expr)
19+
for t.MoveNext() {
20+
elems = append(elems, (t.Current().(*htmlNodeNavigator)).curr)
21+
}
22+
return elems
23+
}
24+
25+
func (s *Selector) FindOne(top *html.Node, expr string) *html.Node {
26+
nav := &htmlNodeNavigator{curr: top, root: top, attr: -1}
27+
t := s.Select(nav, expr)
28+
var elem *html.Node
29+
if t.MoveNext() {
30+
elem = (t.Current().(*htmlNodeNavigator)).curr
31+
}
32+
return elem
33+
}
34+
35+
func (s *Selector) Select(root xpath.NodeNavigator, expr string) *gxpath.NodeIterator {
36+
return gxpath.Select(root, expr)
37+
}
38+
39+
type htmlNodeNavigator struct {
40+
root, curr *html.Node
41+
attr int
42+
}
43+
44+
func (h *htmlNodeNavigator) NodeType() xpath.NodeType {
45+
switch h.curr.Type {
46+
case html.CommentNode:
47+
return xpath.CommentNode
48+
case html.TextNode:
49+
return xpath.TextNode
50+
case html.DocumentNode:
51+
return xpath.RootNode
52+
case html.ElementNode:
53+
if h.attr != -1 {
54+
return xpath.AttributeNode
55+
}
56+
return xpath.ElementNode
57+
}
58+
panic(fmt.Sprintf("unknown HTML node type: %v", h.curr.Type))
59+
}
60+
61+
func (h *htmlNodeNavigator) LocalName() string {
62+
if h.attr != -1 {
63+
return h.curr.Attr[h.attr].Key
64+
}
65+
return h.curr.Data
66+
}
67+
68+
func (*htmlNodeNavigator) Prefix() string {
69+
return ""
70+
}
71+
72+
func (h *htmlNodeNavigator) Value() string {
73+
switch h.curr.Type {
74+
case html.CommentNode:
75+
return h.curr.Data
76+
case html.ElementNode:
77+
if h.attr != -1 {
78+
return h.curr.Attr[h.attr].Val
79+
}
80+
return InnerText(h.curr)
81+
case html.TextNode:
82+
return h.curr.Data
83+
}
84+
return ""
85+
}
86+
87+
func (h *htmlNodeNavigator) Copy() xpath.NodeNavigator {
88+
n := *h
89+
return &n
90+
}
91+
92+
func (h *htmlNodeNavigator) MoveToRoot() {
93+
h.curr = h.root
94+
}
95+
96+
func (h *htmlNodeNavigator) MoveToParent() bool {
97+
if node := h.curr.Parent; node != nil {
98+
h.curr = node
99+
return true
100+
}
101+
return false
102+
}
103+
104+
func (h *htmlNodeNavigator) MoveToNextAttribute() bool {
105+
if h.attr >= len(h.curr.Attr)-1 {
106+
return false
107+
}
108+
h.attr++
109+
return true
110+
}
111+
112+
func (h *htmlNodeNavigator) MoveToChild() bool {
113+
if node := h.curr.FirstChild; node != nil {
114+
h.curr = node
115+
return true
116+
}
117+
return false
118+
}
119+
120+
func (h *htmlNodeNavigator) MoveToFirst() bool {
121+
if h.curr.PrevSibling == nil {
122+
return false
123+
}
124+
for {
125+
node := h.curr.PrevSibling
126+
if node == nil {
127+
break
128+
}
129+
h.curr = node
130+
}
131+
return true
132+
}
133+
134+
func (h *htmlNodeNavigator) String() string {
135+
return h.Value()
136+
}
137+
138+
func (h *htmlNodeNavigator) MoveToNext() bool {
139+
if node := h.curr.NextSibling; node != nil {
140+
h.curr = node
141+
return true
142+
}
143+
return false
144+
}
145+
146+
func (h *htmlNodeNavigator) MoveToPrevious() bool {
147+
if node := h.curr.PrevSibling; node != nil {
148+
h.curr = node
149+
return true
150+
}
151+
return false
152+
}
153+
154+
func (h *htmlNodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
155+
node, ok := other.(*htmlNodeNavigator)
156+
if !ok || node.root != h.root {
157+
return false
158+
}
159+
160+
h.curr = node.curr
161+
h.attr = node.attr
162+
return true
163+
}
164+
165+
// InnerText returns the text between the start and end tags of the object.
166+
func InnerText(n *html.Node) string {
167+
if n.Type == html.TextNode || n.Type == html.CommentNode {
168+
return n.Data
169+
}
170+
var buf bytes.Buffer
171+
for child := n.FirstChild; child != nil; child = child.NextSibling {
172+
buf.WriteString(InnerText(child))
173+
}
174+
return buf.String()
175+
}

html/select_test.go

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package html
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"golang.org/x/net/html"
8+
)
9+
10+
var (
11+
doc = loadHtml()
12+
sel = &Selector{}
13+
)
14+
15+
func TestXPathSelect(t *testing.T) {
16+
if node := sel.FindOne(doc, "/html/head/title"); node == nil {
17+
t.Fatal("cannot found any node")
18+
}
19+
if node := sel.FindOne(doc, "//body[@bgcolor]"); node.Attr[0].Val != "ffffff" {
20+
t.Fatal("body bgcolor is not #ffffff")
21+
}
22+
if list := sel.Find(doc, "//a"); len(list) != 2 {
23+
t.Fatal("count(//a)!=2")
24+
}
25+
if list := sel.Find(doc, "//body/child::*"); len(list) != 9 { // ignored textnode
26+
t.Fatal("count(//body/child::*)!=9")
27+
}
28+
}
29+
30+
func TestInnerText(t *testing.T) {
31+
title := sel.FindOne(doc, "//title")
32+
if txt := InnerText(title); strings.TrimSpace(txt) != "your title here" {
33+
t.Fatalf("InnerText(//title): %s !=your title here", txt)
34+
}
35+
head := sel.FindOne(doc, "/html/head")
36+
if txt := InnerText(head); strings.TrimSpace(txt) != "your title here" {
37+
t.Fatalf("InnerText(/html/head): %s !=your title here", txt)
38+
}
39+
}
40+
41+
func loadHtml() *html.Node {
42+
// http://help.websiteos.com/websiteos/example_of_a_simple_html_page.htm
43+
var str = `<html>
44+
<head>
45+
<title>your title here</title>
46+
</head>
47+
<body bgcolor="ffffff">
48+
<center><img src="clouds.jpg" align="bottom"> </center>
49+
<hr>
50+
<a href="http://somegreatsite.com">link name</a>
51+
is a link to another nifty site
52+
<h1>this is a header</h1>
53+
<h2>this is a medium header</h2>
54+
send me mail at <a href="mailto:[email protected]">[email protected]</a>.
55+
<p> this is a new paragraph!
56+
<p> <b>this is a new paragraph!</b>
57+
<br> <b><i>this is a new sentence without a paragraph break, in bold italics.</i></b>
58+
<hr>
59+
</body>
60+
</html`
61+
node, err := html.Parse(strings.NewReader(str))
62+
if err != nil {
63+
panic(err)
64+
}
65+
return node
66+
}

query.go

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package xquery
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/antchfx/gxpath"
7+
"github.com/antchfx/gxpath/xpath"
8+
"github.com/antchfx/xquery/html"
9+
"github.com/antchfx/xquery/xml"
10+
)
11+
12+
// Type identifies the kind of document an XPath selector operates on.
type Type int

// Supported document types.
const (
	// HTML selects the HTML selector.
	HTML Type = 0
	// XML selects the XML selector.
	XML Type = 1
)
19+
20+
// Selector is an interface for XPath search.
21+
type Selector interface {
22+
Select(xpath.NodeNavigator, string) *gxpath.NodeIterator
23+
}
24+
25+
// New returns new Selector for the specified documents type.
26+
func New(typ Type) Selector {
27+
switch typ {
28+
case HTML:
29+
return &html.Selector{}
30+
case XML:
31+
return &xml.Selector{}
32+
}
33+
panic(fmt.Errorf("unknown type : %d", typ))
34+
}

0 commit comments

Comments
 (0)