-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
111 lines (95 loc) · 2.8 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
package main
import (
"os"
"path/filepath"
"strings"
"sync"
"github.com/spf13/cobra"
)
// Package-level crawl state shared by main and the worker goroutines.
var (
// tasks carries the URLs that workers receive and fetch; main creates it
// with a buffer of 100 and closes it once all work has finished.
tasks chan string
// wg tracks outstanding tasks: each worker calls Done after a fetch and
// main blocks on Wait. The matching Add is not in this part of the file —
// presumably done where URLs are queued; verify against enqueue.
wg sync.WaitGroup
// visited is keyed by string; presumably the set of URLs already queued,
// used to avoid fetching a page twice — TODO confirm against enqueue.
visited = make(map[string]bool)
// visitedMu presumably guards visited — confirm against its users.
visitedMu sync.Mutex
// pages holds the fetched Page values by string key; main reads each
// entry's Content when totalling tokens and passes the map to
// serializePages.
pages = make(map[string]Page)
// pagesMu is held by main while it iterates over pages.
pagesMu sync.Mutex
)
// worker receives URLs from the shared tasks channel and fetches each one in
// turn, marking one unit of work done on wg after every fetch. It returns
// once tasks has been closed and drained.
func worker(logger *Logger, opts Options) {
	for {
		target, open := <-tasks
		if !open {
			// Channel closed and empty: no more work will arrive.
			return
		}
		fetchPage(target, logger, opts)
		wg.Done()
	}
}
// main builds the sitefetch command, parses its flags, starts the worker
// goroutines, queues the start URL and waits for all queued work to finish.
// It then logs the total token count and emits the serialized pages: to the
// file named by --outfile when given, otherwise to standard output.
//
// The process exits with status 1 when the command line is invalid or when
// the result cannot be written.
func main() {
	rootCmd := &cobra.Command{
		Use:   "sitefetch [url]",
		Short: "Fetch a site and extract its readable content as Markdown",
		Args:  cobra.MinimumNArgs(1),
		Run: func(cmd *cobra.Command, args []string) {
			// Every flag read here is registered on rootCmd below with the
			// matching type, so a lookup error could only mean a programming
			// mistake; the errors are deliberately ignored.
			flags := cmd.Flags()
			outfile, _ := flags.GetString("outfile")
			concurrency, _ := flags.GetInt("concurrency")
			matchFlag, _ := flags.GetString("match")
			contentSelector, _ := flags.GetString("content-selector")
			limit, _ := flags.GetInt("limit")
			silent, _ := flags.GetBool("silent")
			siteURL := args[0]

			// With zero or negative concurrency no worker would be started
			// and nothing would ever receive from the tasks channel, so fall
			// back to a single worker instead of stalling.
			if concurrency < 1 {
				concurrency = 1
			}

			// Split the comma-separated patterns, dropping blank entries.
			// An empty flag yields a single blank entry, so matches stays nil.
			var matches []string
			for _, pattern := range strings.Split(matchFlag, ",") {
				if pattern = strings.TrimSpace(pattern); pattern != "" {
					matches = append(matches, pattern)
				}
			}

			opts := Options{
				Concurrency:     concurrency,
				Matches:         matches,
				ContentSelector: contentSelector,
				Limit:           limit,
				Silent:          silent,
			}
			logger := &Logger{silent: silent}

			tasks = make(chan string, 100)
			for i := 0; i < concurrency; i++ {
				go worker(logger, opts)
			}

			logger.Info("Started fetching ", siteURL, " with a concurrency of ", concurrency)
			enqueue(siteURL, true, opts, logger)

			// Wait for every queued task, then close the channel so the
			// workers' receive loops end.
			wg.Wait()
			close(tasks)

			totalTokens := 0
			pagesMu.Lock()
			for _, page := range pages {
				totalTokens += countTokens(page.Content)
			}
			count := len(pages)
			pagesMu.Unlock()
			logger.Info("Total token count for ", count, " pages: ", formatNumber(totalTokens))

			output := []byte(serializePages(pages))

			if outfile == "" {
				// No output file requested: the result goes to stdout,
				// followed by a newline.
				if _, err := os.Stdout.Write(append(output, '\n')); err != nil {
					logger.Warn("Failed to write output:", err)
					os.Exit(1)
				}
				return
			}

			// Exit non-zero on failure so callers can detect it even when
			// --silent suppresses the log line.
			if err := os.MkdirAll(filepath.Dir(outfile), os.ModePerm); err != nil {
				logger.Warn("Failed to create directory:", err)
				os.Exit(1)
			}
			if err := os.WriteFile(outfile, output, 0644); err != nil {
				logger.Warn("Failed to write file:", err)
				os.Exit(1)
			}
		},
	}

	// Define command-line flags using Cobra.
	rootCmd.Flags().String("outfile", "", "Write the fetched site to a text file")
	rootCmd.Flags().Int("concurrency", 3, "Number of concurrent requests")
	rootCmd.Flags().String("match", "", "Only fetch matched pages (comma separated)")
	rootCmd.Flags().String("content-selector", "", "The CSS selector to find content")
	rootCmd.Flags().Int("limit", 0, "Limit the result to this number of pages")
	rootCmd.Flags().Bool("silent", false, "Do not print any logs")

	if err := rootCmd.Execute(); err != nil {
		os.Exit(1)
	}
}