Skip to content
This repository was archived by the owner on Jan 13, 2023. It is now read-only.

Scrape Everything From GitHub #164

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Gitrob is a tool to help find potentially sensitive files pushed to public repos
Suppress all output except for errors
-threads int
Number of concurrent threads (default number of logical CPUs)
-gather-all
Specify whether to pull all repositories from the domain
```

### Saving session to a file
Expand Down
12 changes: 8 additions & 4 deletions core/git.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package core
import (
"fmt"
"io/ioutil"

"gopkg.in/src-d/go-git.v4"
"gopkg.in/src-d/go-git.v4/plumbing"
"gopkg.in/src-d/go-git.v4/plumbing/object"
Expand All @@ -14,9 +13,14 @@ const (
EmptyTreeCommitId = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
)

func CloneRepository(url *string, branch *string, depth int) (*git.Repository, string, error) {
urlVal := *url
branchVal := *branch
func CloneRepository(repo *GithubRepository, depth int) (*git.Repository, string, error) {
var urlVal string
if repo.CloneURL != nil {
urlVal = *repo.CloneURL
} else {
urlVal = *repo.URL
}
branchVal := *repo.DefaultBranch
dir, err := ioutil.TempDir("", "gitrob")
if err != nil {
return nil, "", err
Expand Down
88 changes: 87 additions & 1 deletion core/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package core

import (
"context"

"github.com/google/go-github/github"
)

Expand Down Expand Up @@ -111,3 +110,90 @@ func GetOrganizationMembers(login *string, client *github.Client) ([]*GithubOwne
}
return allMembers, nil
}

// DetermineRepositoryCount probes the global repository listing to find the
// highest repository ID that currently exists. The returned value is an upper
// bound on repository IDs (suitable for splitting the ID space into ranges),
// not a literal count of repositories.
//
// The search starts at ID 0. Every time a probe returns repositories, the
// highest ID seen is remembered and the next probe jumps to twice that ID.
// Every time a probe comes back without anything new, the probe position is
// moved halfway back towards the last known ID. The search ends when a probe
// placed exactly at the last known ID yields nothing further.
//
// Forks are taken into account when advancing the probe: only the position in
// the ID space matters here, and ignoring forks would stall the search on a
// page that consists of forks only.
//
// On API failure it returns -1 and the error.
func DetermineRepositoryCount(client *github.Client) (int64, error) {
	ctx := context.Background()

	// sinceValue is the current probe position, lastValue the highest
	// repository ID that has actually been observed so far.
	var sinceValue, lastValue int64

	for {
		opt := &github.RepositoryListAllOptions{
			Since: sinceValue,
		}
		repos, _, err := client.Repositories.ListAll(ctx, opt)
		if err != nil {
			return -1, err
		}

		highest := sinceValue
		for _, repo := range repos {
			if repo == nil || repo.ID == nil {
				continue
			}
			if id := int64(*repo.ID); id > highest {
				highest = id
			}
		}

		if highest == sinceValue {
			// Nothing beyond the probe position (empty page, or a page that
			// carried no usable IDs): either we are done or we overshot.
			if sinceValue == lastValue {
				return sinceValue, nil
			}
			sinceValue = (lastValue + sinceValue) / 2
			continue
		}

		// Progress was made: remember it and leap ahead.
		lastValue = highest
		sinceValue = highest * 2
	}
}

// GetAllRepositories pages through the global repository listing and collects
// every non-fork repository whose ID lies in the range (start, end].
//
// start is passed to the API as the initial "since" cursor; the scan stops as
// soon as a repository with an ID greater than end is seen, or when the
// listing is exhausted. Because the upper bound is inclusive and the lower
// bound exclusive, consecutive ranges such as (0, n] and (n, 2n] do not
// overlap.
//
// The list endpoint is not relied on to provide the default branch, so every
// returned repository has DefaultBranch set to "master".
//
// If an API call fails, the repositories gathered before the failure are
// returned together with the error.
func GetAllRepositories(client *github.Client, start int64, end int64) ([]*GithubRepository, error) {
	var allRepos []*GithubRepository
	ctx := context.Background()

	defaultBranch := "master"
	cursor := start

	for {
		opt := &github.RepositoryListAllOptions{
			Since: cursor,
		}
		repos, _, err := client.Repositories.ListAll(ctx, opt)
		if err != nil {
			return allRepos, err
		}
		if len(repos) == 0 {
			return allRepos, nil
		}

		advanced := false
		for _, repo := range repos {
			if repo == nil || repo.ID == nil {
				// Without an ID the entry can neither be ordered nor
				// de-duplicated later on.
				continue
			}

			id := int64(*repo.ID)
			if id > end {
				return allRepos, nil
			}

			// Advance the cursor on every entry, forks included; otherwise a
			// page made up solely of forks would be requested over and over.
			if id > cursor {
				cursor = id
				advanced = true
			}

			if repo.Fork != nil && *repo.Fork {
				continue
			}

			var owner *string
			if repo.Owner != nil {
				owner = repo.Owner.Login
			}

			allRepos = append(allRepos, &GithubRepository{
				Owner:         owner,
				ID:            repo.ID,
				Name:          repo.Name,
				FullName:      repo.FullName,
				CloneURL:      repo.CloneURL,
				URL:           repo.HTMLURL,
				DefaultBranch: &defaultBranch,
				Description:   repo.Description,
				Homepage:      repo.Homepage,
			})
		}

		if !advanced {
			// A non-empty page that did not move the cursor would be fetched
			// again unchanged; stop instead of looping forever.
			return allRepos, nil
		}
	}
}
2 changes: 2 additions & 0 deletions core/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ type Options struct {
CommitDepth *int
GithubAccessToken *string `json:"-"`
NoExpandOrgs *bool
GatherAll *bool
Threads *int
Save *string `json:"-"`
Load *string `json:"-"`
Expand All @@ -23,6 +24,7 @@ func ParseOptions() (Options, error) {
CommitDepth: flag.Int("commit-depth", 500, "Number of repository commits to process"),
GithubAccessToken: flag.String("github-access-token", "", "GitHub access token to use for API requests"),
NoExpandOrgs: flag.Bool("no-expand-orgs", false, "Don't add members to targets when processing organizations"),
GatherAll: flag.Bool("gather-all", false, "Gather all repositories on the domain"),
Threads: flag.Int("threads", 0, "Number of concurrent threads (default number of logical CPUs)"),
Save: flag.String("save", "", "Save session to file"),
Load: flag.String("load", "", "Load session file"),
Expand Down
59 changes: 55 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"strings"
"sync"
"time"

"math"
"github.com/michenriksen/gitrob/core"
)

Expand Down Expand Up @@ -86,6 +86,52 @@ func GatherRepositories(sess *core.Session) {
wg.Wait()
}

// GatherReposConcurrent launches a goroutine that fetches the repositories for
// the ID range given by start and end via core.GetAllRepositories and adds each
// of them to the session. thread_num is only used to label log output. A fetch
// error is logged, and whatever repositories were returned alongside it are
// still added. wg.Done is called once when the goroutine finishes; the caller
// is expected to have called wg.Add beforehand.
func GatherReposConcurrent(sess *core.Session, thread_num int, start int64, end int64, wg *sync.WaitGroup) {
	worker := func() {
		defer wg.Done()

		sess.Out.Debug(" Thread [%d] for repository gathering: [%d:%d]\n", thread_num, start, end)

		found, fetchErr := core.GetAllRepositories(sess.GithubClient, start, end)
		if fetchErr != nil {
			sess.Out.Error(" Failed to retrieve all repositories %s\n", fetchErr)
		}

		for i := range found {
			sess.Out.Debug(" Retrieved repository: %s\n", *found[i].FullName)
			sess.AddRepository(found[i])
		}

		total := len(found)
		sess.Out.Info(" Thread [%d] Retrieved %d %s\n", thread_num, total, core.Pluralize(total, "repository", "repositories"))
	}

	go worker()
}

// GatherAllRepositories collects repositories across the whole repository ID
// space and adds them to the session.
//
// It first asks core.DetermineRepositoryCount for an upper bound on repository
// IDs, splits the range from 0 up to that bound into one contiguous slice per
// worker, and hands each slice to GatherReposConcurrent. It blocks until every
// worker has finished.
//
// If the upper bound cannot be determined, a single worker scans the range
// from 0 to math.MaxInt64.
func GatherAllRepositories(sess *core.Session) {
	var wg sync.WaitGroup

	threadNum := int64(1)

	count, err := core.DetermineRepositoryCount(sess.GithubClient)
	if err != nil {
		sess.Out.Error("Failed to find upper limit on repositories: %s. Setting threads to 1\n", err)
		count = math.MaxInt64
	} else {
		threadNum = int64(*sess.Options.Threads)
	}

	// Never use more workers than there are IDs to hand out, and always use
	// at least one so the division below cannot divide by zero.
	if threadNum > count {
		threadNum = count
	}
	if threadNum < 1 {
		threadNum = 1
	}

	sess.Out.Debug("Threads for repository gathering: %d\n", threadNum)

	// All range arithmetic stays in int64 so large bounds are not truncated
	// on platforms where int is 32 bits wide.
	bounds := count / threadNum

	wg.Add(int(threadNum))
	for i := int64(0); i < threadNum; i++ {
		start := i * bounds
		end := start + bounds
		if i == threadNum-1 {
			// The last worker also takes the remainder of the division so
			// the tail of the ID space is not skipped.
			end = count
		}
		GatherReposConcurrent(sess, int(i), start, end, &wg)
	}

	wg.Wait()
	sess.Out.Info("Finished Pulling All Repos\n")
}

func AnalyzeRepositories(sess *core.Session) {
sess.Stats.Status = core.StatusAnalyzing
var ch = make(chan *core.GithubRepository, len(sess.Repositories))
Expand Down Expand Up @@ -115,7 +161,7 @@ func AnalyzeRepositories(sess *core.Session) {
}

sess.Out.Debug("[THREAD #%d][%s] Cloning repository...\n", tid, *repo.FullName)
clone, path, err := core.CloneRepository(repo.CloneURL, repo.DefaultBranch, *sess.Options.CommitDepth)
clone, path, err := core.CloneRepository(repo, *sess.Options.CommitDepth)
if err != nil {
if err.Error() != "remote repository is empty" {
sess.Out.Error("Error cloning repository %s: %s\n", *repo.FullName, err)
Expand Down Expand Up @@ -223,12 +269,17 @@ func main() {
if sess.Stats.Status == "finished" {
sess.Out.Important("Loaded session file: %s\n", *sess.Options.Load)
} else {
if len(sess.Options.Logins) == 0 {
if len(sess.Options.Logins) == 0 && !*sess.Options.GatherAll {
sess.Out.Fatal("Please provide at least one GitHub organization or user\n")
}
}

GatherTargets(sess)
GatherRepositories(sess)

if *sess.Options.GatherAll {
GatherAllRepositories(sess)
}

AnalyzeRepositories(sess)
sess.Finish()

Expand Down