diff --git a/Makefile b/Makefile index ae2c0ae..5c90f89 100644 --- a/Makefile +++ b/Makefile @@ -6,29 +6,36 @@ LAMBDA_SABA_DISAMBIGUATOR_RULE_NAME ?= MackerelSocialNextCron export CGO_ENABLED := 0 +.PHONY: import-pos import-pos: touch _pos.json pos.json && cat _pos.json pos.json | jq -r .id_str > pos_cache_ids - cat data/pos.txt | go run import_json.go pos_cache_ids | tee -a _pos.json + go run import_json.go -a _pos.json pos_cache_ids pos.json +.PHONY: import-neg import-neg: touch _neg.json neg.json && cat _neg.json neg.json | jq -r .id_str > neg_cache_ids - cat data/neg.txt | go run import_json.go neg_cache_ids | tee -a _neg.json + go run import_json.go -a _neg.json neg_cache_ids neg.json +.PHONY: import import: @make import-pos import-neg +.PHONY: clean clean: - rm _neg.json _pos.json neg.json neg_cache_ids pos.json pos_cache_ids + rm -f _neg.json _pos.json neg.json neg_cache_ids pos.json pos_cache_ids +.PHONY: learn learn: go run train_perceptron.go pos.json neg.json +.PHONY: format format: gofmt -w functions/**/*.go lib/*.go *.go goimports -w functions/**/*.go lib/*.go *.go +.PHONY: sam-package sam-package: cd functions/saba_disambiguator; GOARCH=amd64 GOOS=linux go build -o build/saba_disambiguator main.go if aws s3 ls "s3://${BUCKET_NAME}" 2>&1 | grep -q 'AccessDenied'; then \ @@ -43,11 +50,10 @@ sam-package: --s3-prefix ${S3_PREFIX} \ --output-template-file sam.yml \ +.PHONY: sam-deploy sam-deploy: ${AWSCMD} deploy \ --template-file sam.yml \ --stack-name ${STACK_NAME} \ --parameter-overrides LambdaSabaDisambiguatorRuleName=${LAMBDA_SABA_DISAMBIGUATOR_RULE_NAME} \ --capabilities CAPABILITY_IAM - -.PHONY: import learn sam-package sam-deploy diff --git a/import_json.go b/import_json.go index 4de7426..4baa831 100644 --- a/import_json.go +++ b/import_json.go @@ -5,7 +5,9 @@ package main import ( "bufio" + "flag" "fmt" + "io" "log" "os" "strings" @@ -47,12 +49,27 @@ func cacheIdsFromFile(filename string) (map[int64]struct{}, error) { return cachedIds, nil } +var flagAppend = flag.String("a", "", "append new tweets to `file`") + +type WriteSyncer interface { + io.Writer + Sync() error +} + +type nopWriter struct{} + +func (*nopWriter) Write(p []byte) (int, error) { return len(p), nil } +func (*nopWriter) Sync() error { return nil } + func main() { log.SetFlags(0) + flag.Parse() + config, err := sabadisambiguator.GetConfigFromFile("functions/saba_disambiguator/build/config.yml") if err != nil { log.Fatalf("failed to load config: %v\n", err) } + svc := ssm.New(session.New(), &aws.Config{ Region: aws.String(config.Region), }) @@ -62,11 +79,21 @@ func main() { log.Fatalf("failed to get Twitter client: %v\n", err) } - cachedIds, err := cacheIdsFromFile(os.Args[1]) + cachedIds, err := cacheIdsFromFile(flag.Arg(0)) if err != nil { log.Fatalf("failed to read cache: %v\n", err) } + var w WriteSyncer = &nopWriter{} + if *flagAppend != "" { + f, err := os.OpenFile(*flagAppend, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666) + if err != nil { + log.Fatalf("failed to open '%s': %v\n", *flagAppend, err) + } + defer f.Close() + w = f + } + stdin := bufio.NewScanner(os.Stdin) for stdin.Scan() { text := stdin.Text() @@ -87,8 +114,12 @@ func main() { tweetJson, _ := json.Marshal(tweet) fmt.Println(string(tweetJson)) + fmt.Fprintln(w, string(tweetJson)) } if err := stdin.Err(); err != nil { log.Fatalln(err) } + if err := w.Sync(); err != nil { + log.Fatalf("failed to flush tweets: %v\n", err) + } } diff --git a/lib/example.go b/lib/example.go index 3efbc85..5ef2a41 100644 --- a/lib/example.go +++ b/lib/example.go @@ -21,8 +21,8 @@ type Example struct { type Examples []*Example -func NewExample(tweet twitter.Tweet, label LabelType) *Example { - fv := ExtractFeatures(tweet) +func NewExampleWithOptions(tweet twitter.Tweet, label LabelType, opts ExtractOptions) *Example { + fv := ExtractFeaturesWithOptions(tweet, opts) return &Example{Label: label, Fv: fv, Tweet: tweet} } diff --git a/lib/feature.go b/lib/feature.go index 86e4e15..2b56fde 100644 --- a/lib/feature.go +++ b/lib/feature.go @@ -145,10 +145,6 @@ func (opts *ExtractOptions) includeScreenNameInReplyToScreenName(t twitter.Tweet return opts.contains(t.InReplyToScreenName) } -func ExtractFeatures(t twitter.Tweet) FeatureVector { - return ExtractFeaturesWithOptions(t, ExtractOptions{}) -} - func ExtractFeaturesWithOptions(t twitter.Tweet, opts ExtractOptions) FeatureVector { var fv FeatureVector text := t.Text @@ -156,7 +152,7 @@ func ExtractFeaturesWithOptions(t twitter.Tweet, opts ExtractOptions) FeatureVec fv = append(fv, "BIAS") fv = append(fv, "ScreenName:"+t.User.ScreenName) fv = append(fv, "inReplyToScreenName:"+inReplyToScreenName(t)) - fv = append(fv, "screenNameInQuotedStatus"+screenNameInQuotedStatus(t)) + fv = append(fv, "screenNameInQuotedStatus:"+screenNameInQuotedStatus(t)) fv = append(fv, "lang:"+lang(t)) fv = append(fv, "containsMackerelInScreenName:"+strconv.FormatBool(opts.contains(t.User.ScreenName))) fv = append(fv, "includeMackerelInUserMentions:"+strconv.FormatBool(opts.includeScreenNameInUserMentions(t))) diff --git a/train_perceptron.go b/train_perceptron.go index 563bf63..e0366ca 100644 --- a/train_perceptron.go +++ b/train_perceptron.go @@ -13,6 +13,8 @@ import ( sabadisambiguator "github.com/syou6162/saba_disambiguator/lib" ) +var config *sabadisambiguator.Config + func parseLine(line string) (twitter.Tweet, error) { var tweet twitter.Tweet err := json.Unmarshal([]byte(line), &tweet) @@ -36,7 +38,9 @@ func readExamplesFromFile(fileName string, label sabadisambiguator.LabelType) (s continue } - e := sabadisambiguator.NewExample(t, label) + e := sabadisambiguator.NewExampleWithOptions(t, label, sabadisambiguator.ExtractOptions{ + ScreenNames: config.ScreenNames, + }) examples = append(examples, e) } if err := scanner.Err(); err != nil { @@ -45,8 +49,26 @@ func readExamplesFromFile(fileName string, label sabadisambiguator.LabelType) (s return examples, nil } +func loadConfig(file string) (*sabadisambiguator.Config, error) { + c, err := sabadisambiguator.GetConfigFromFile(file) + if err != nil { + if os.IsNotExist(err) { + return &sabadisambiguator.Config{}, nil + } + return nil, err + } + return c, nil +} + func main() { log.SetFlags(0) + + c, err := loadConfig("functions/saba_disambiguator/build/config.yml") + if err != nil { + log.Fatalf("failed to load config: %v\n", err) + } + config = c + examplesPos, err := readExamplesFromFile(os.Args[1], sabadisambiguator.POSITIVE) if err != nil { log.Fatalf("failed to read %s: %v\n", os.Args[1], err)