Patch: add org.clojurenlp.annotations (accessor functions over CoreNLP
annotation keys) and switch org.clojurenlp.core over to those accessors.

Reviewer notes on this revision of the patch:
  * annotations.clj used to define `get-speaker` twice (token-level first,
    quote-level second). The second defn replaced the first, leaving the
    token-level accessor unreachable. The token-level one is now
    `get-token-speaker`; `get-speaker` keeps the quote-level behaviour.
  * `get-tokens` is hinted ^CoreMap instead of ^Annotation, so it accepts any
    CoreMap and not only Annotation instances.
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of the patch. If context lines are rejected, try
    `git apply --ignore-whitespace`.

diff --git a/src/org/clojurenlp/annotations.clj b/src/org/clojurenlp/annotations.clj
new file mode 100644
index 0000000..d466ff8
--- /dev/null
+++ b/src/org/clojurenlp/annotations.clj
@@ -0,0 +1,215 @@
+(ns org.clojurenlp.annotations
+  "Thin accessors over Stanford CoreNLP annotation keys.
+  Most functions call `.get` on their argument with a single key class;
+  `gather-quotes` and `get-sentiment` delegate to static helper methods."
+  (:import
+   (edu.stanford.nlp.dcoref
+    CorefCoreAnnotations$CorefChainAnnotation)
+   (edu.stanford.nlp.ling
+    CoreAnnotations
+    CoreAnnotations$AfterAnnotation
+    CoreAnnotations$AuthorAnnotation
+    CoreAnnotations$BeforeAnnotation
+    CoreAnnotations$CharacterOffsetBeginAnnotation
+    CoreAnnotations$CharacterOffsetEndAnnotation
+    CoreAnnotations$DocDateAnnotation
+    CoreAnnotations$DocIDAnnotation
+    CoreAnnotations$DocSourceTypeAnnotation
+    CoreAnnotations$DocTypeAnnotation
+    CoreAnnotations$IndexAnnotation
+    CoreAnnotations$KBPTriplesAnnotation
+    CoreAnnotations$LineNumberAnnotation
+    CoreAnnotations$LocationAnnotation
+    CoreAnnotations$MentionsAnnotation
+    CoreAnnotations$NamedEntityTagAnnotation
+    CoreAnnotations$NormalizedNamedEntityTagAnnotation
+    CoreAnnotations$PartOfSpeechAnnotation
+    CoreAnnotations$OriginalTextAnnotation
+    CoreAnnotations$QuotationIndexAnnotation
+    CoreAnnotations$QuotationsAnnotation
+    CoreAnnotations$SectionDateAnnotation
+    CoreAnnotations$SectionsAnnotation
+    CoreAnnotations$SentencesAnnotation
+    CoreAnnotations$SentenceBeginAnnotation
+    CoreAnnotations$SentenceEndAnnotation
+    CoreAnnotations$SentenceIDAnnotation
+    CoreAnnotations$SentenceIndexAnnotation
+    CoreAnnotations$SpeakerAnnotation
+    CoreAnnotations$TextAnnotation
+    CoreAnnotations$TokensAnnotation
+    CoreAnnotations$TokenBeginAnnotation
+    CoreAnnotations$TokenEndAnnotation
+    CoreAnnotations$TrueCaseAnnotation
+    CoreAnnotations$TrueCaseTextAnnotation
+    CoreAnnotations$WikipediaEntityAnnotation)
+   (edu.stanford.nlp.naturalli
+    NaturalLogicAnnotations
+    NaturalLogicAnnotations$RelationTriplesAnnotation)
+   (edu.stanford.nlp.neural.rnn
+    RNNCoreAnnotations)
+   (edu.stanford.nlp.pipeline
+    Annotation
+    QuoteAnnotator
+    QuoteAttributionAnnotator$SpeakerAnnotation)
+   (edu.stanford.nlp.semgraph
+    SemanticGraphCoreAnnotations$BasicDependenciesAnnotation
+    SemanticGraphCoreAnnotations$EnhancedPlusPlusDependenciesAnnotation
+    SemanticGraphCoreAnnotations$EnhancedDependenciesAnnotation)
+   (edu.stanford.nlp.sentiment
+    SentimentCoreAnnotations$SentimentAnnotatedTree
+    SentimentCoreAnnotations$SentimentClass)
+   (edu.stanford.nlp.time TimeAnnotations$TimexAnnotation)
+   (edu.stanford.nlp.trees
+    TreeCoreAnnotations$TreeAnnotation)
+   (edu.stanford.nlp.util CoreMap)
+   (org.ejml.simple SimpleMatrix)))
+
+
+
+;;; Text
+
+(defn get-text [ann] (.get ann CoreAnnotations$TextAnnotation))
+
+(defn get-original-text [ann]
+  (.get ann CoreAnnotations$OriginalTextAnnotation))
+
+(defn get-index [ann]
+  (.get ann CoreAnnotations$IndexAnnotation))
+
+(defn get-character-offset-begin [ann]
+  (.get ann CoreAnnotations$CharacterOffsetBeginAnnotation))
+
+(defn get-character-offset-end [ann]
+  (.get ann CoreAnnotations$CharacterOffsetEndAnnotation))
+
+;; pos
+
+(defn get-part-of-speech [ann]
+  (.get ann CoreAnnotations$PartOfSpeechAnnotation))
+
+;;; Tokens
+
+;; Hinted as CoreMap (not Annotation): core.clj calls this on sentence elements.
+(defn get-tokens [^CoreMap ann]
+  (.get ann CoreAnnotations$TokensAnnotation))
+
+(defn get-token-begin [ann]
+  (.get ann CoreAnnotations$TokenBeginAnnotation))
+
+(defn get-token-end [ann]
+  (.get ann CoreAnnotations$TokenEndAnnotation))
+
+(defn get-token-speaker
+  "Speaker stored on `token` under CoreAnnotations$SpeakerAnnotation.
+  Kept separate from the quote-level `get-speaker` below: two defns of the
+  same name in one namespace leave only the later one callable."
+  [token]
+  (.get token CoreAnnotations$SpeakerAnnotation))
+
+(defn get-true-case [token]
+  (.get token CoreAnnotations$TrueCaseAnnotation))
+
+(defn get-true-case-text [token]
+  (.get token CoreAnnotations$TrueCaseTextAnnotation))
+
+(defn get-before [token]
+  (.get token CoreAnnotations$BeforeAnnotation))
+
+(defn get-after [token]
+  (.get token CoreAnnotations$AfterAnnotation))
+
+
+;;; Document
+
+(defn get-doc-id [doc] (.get doc CoreAnnotations$DocIDAnnotation))
+(defn get-doc-date [doc] (.get doc CoreAnnotations$DocDateAnnotation))
+(defn get-doc-source-type [doc] (.get doc CoreAnnotations$DocSourceTypeAnnotation))
+(defn get-doc-type [doc] (.get doc CoreAnnotations$DocTypeAnnotation))
+(defn get-author [doc] (.get doc CoreAnnotations$AuthorAnnotation))
+(defn get-location [doc] (.get doc CoreAnnotations$LocationAnnotation))
+
+;;; Sentences
+
+(defn get-sentences [ann] (.get ann CoreAnnotations$SentencesAnnotation))
+#_(defn get-before [sentence] edu.stanford.nlp.ling.CoreAnnotations$BeforeAnnotation)
+#_(defn get-after [sentence] CoreAnnotations$AfterAnnotation)
+
+(defn get-sentence-id [sentence]
+  (.get sentence CoreAnnotations$SentenceIDAnnotation))
+(defn get-sentence-index [sentence]
+  (.get sentence CoreAnnotations$SentenceIndexAnnotation))
+(defn get-line-number [sentence]
+  (.get sentence CoreAnnotations$LineNumberAnnotation))
+(defn get-tree [sentence]
+  (.get sentence TreeCoreAnnotations$TreeAnnotation)) ; note the "Tree"Core.
+(defn get-basic-dependencies [sentence]
+  (.get sentence SemanticGraphCoreAnnotations$BasicDependenciesAnnotation))
+(defn get-enhanced-dependencies [sentence]
+  (.get sentence SemanticGraphCoreAnnotations$EnhancedDependenciesAnnotation))
+(defn get-enhanced-plus-plus-dependencies [sentence]
+  (.get sentence SemanticGraphCoreAnnotations$EnhancedPlusPlusDependenciesAnnotation))
+
+;; Sentiment
+
+(defn get-sentiment-class [sentence]
+  (.get sentence SentimentCoreAnnotations$SentimentClass))
+
+(defn get-sentiment-annotated-tree [sentence]
+  (.get sentence SentimentCoreAnnotations$SentimentAnnotatedTree))
+
+(defn get-sentiment [sentiment-tree]
+  (RNNCoreAnnotations/getPredictedClass sentiment-tree))
+
+#_(defn get-sentiment-predictions [sentiment-tree]
+    (RNNCoreAnnotations/getPredictionsAsStringList sentiment-tree))
+
+;; OpenIE
+(defn get-relation-triples [sentence]
+  (.get sentence NaturalLogicAnnotations$RelationTriplesAnnotation))
+
+;; KBP
+(defn get-kbp-triples [sentence]
+  (.get sentence CoreAnnotations$KBPTriplesAnnotation))
+
+;; Entity mentions
+(defn get-mentions [sentence]
+  (.get sentence CoreAnnotations$MentionsAnnotation))
+(defn get-named-entity-tag [mention]
+  (.get mention CoreAnnotations$NamedEntityTagAnnotation))
+(defn get-normalized-named-entity-tag [mention]
+  (.get mention CoreAnnotations$NormalizedNamedEntityTagAnnotation))
+(defn get-wikipedia-entity [mention]
+  (.get mention CoreAnnotations$WikipediaEntityAnnotation))
+(defn get-time [ann] (.get ann TimeAnnotations$TimexAnnotation))
+
+;;; Quotes
+
+(defn get-quotations [ann]
+  (.get ann CoreAnnotations$QuotationsAnnotation))
+
+(defn gather-quotes [ann] (QuoteAnnotator/gatherQuotes ann))
+
+(defn get-quotation-index [quote]
+  (.get quote CoreAnnotations$QuotationIndexAnnotation))
+
+(defn get-sentence-begin [quote]
+  (.get quote CoreAnnotations$SentenceBeginAnnotation))
+
+(defn get-sentence-end [quote]
+  (.get quote CoreAnnotations$SentenceEndAnnotation))
+
+(defn get-speaker
+  "Speaker stored on `quote` under QuoteAttributionAnnotator$SpeakerAnnotation.
+  For the token-level CoreAnnotations$SpeakerAnnotation use `get-token-speaker`."
+  [quote]
+  (.get quote QuoteAttributionAnnotator$SpeakerAnnotation))
+
+;; Sections
+(defn get-sections [ann] (.get ann CoreAnnotations$SectionsAnnotation))
+(defn get-section-date [section]
+  (.get section CoreAnnotations$SectionDateAnnotation))
+
+;; corefs
+(defn get-coref-chain [doc]
+  (.get doc CorefCoreAnnotations$CorefChainAnnotation))
+
diff --git a/src/org/clojurenlp/core.clj b/src/org/clojurenlp/core.clj
index c9349ab..9ec2c93 100644
--- a/src/org/clojurenlp/core.clj
+++ b/src/org/clojurenlp/core.clj
@@ -1,30 +1,27 @@
 (ns org.clojurenlp.core
   (:require
+   [org.clojurenlp.annotations :as ann]
    [clojure.data.json :as json]
    [clojure.set :as set]
    [loom.attr :as attr]
    [loom.graph :as graph])
-  (:import (java.io StringReader)
-           (java.util ArrayList
-                      Collection
-                      Map
-                      Properties)
-           (edu.stanford.nlp.process DocumentPreprocessor
-                                     PTBTokenizer)
-           (edu.stanford.nlp.ling CoreLabel TaggedWord Word)
-           (edu.stanford.nlp.tagger.maxent MaxentTagger)
-           (edu.stanford.nlp.trees LabeledScoredTreeNode
-                                   LabeledScoredTreeReaderFactory
-                                   PennTreebankLanguagePack
-                                   TypedDependency)
-           (edu.stanford.nlp.parser.common ParserGrammar)
-           (edu.stanford.nlp.parser.lexparser LexicalizedParser)
-           (edu.stanford.nlp.pipeline Annotation StanfordCoreNLP)
-           (edu.stanford.nlp.ling CoreAnnotations$SentencesAnnotation
-                                  CoreAnnotations$TextAnnotation
-                                  CoreAnnotations$NamedEntityTagAnnotation
-                                  CoreAnnotations$TokensAnnotation
-                                  Word))
+  (:import
+   (java.io StringReader)
+   (java.util ArrayList
+              Collection
+              Map
+              Properties)
+   (edu.stanford.nlp.process DocumentPreprocessor
+                             PTBTokenizer)
+   (edu.stanford.nlp.ling CoreLabel TaggedWord Word)
+   (edu.stanford.nlp.tagger.maxent MaxentTagger)
+   (edu.stanford.nlp.trees LabeledScoredTreeNode
+                           LabeledScoredTreeReaderFactory
+                           PennTreebankLanguagePack
+                           TypedDependency)
+   (edu.stanford.nlp.parser.common ParserGrammar)
+   (edu.stanford.nlp.parser.lexparser LexicalizedParser)
+   (edu.stanford.nlp.pipeline Annotation StanfordCoreNLP))
   (:gen-class :main true))
 
 (defn pprint-methods!
@@ -52,12 +49,11 @@
 (defn tokenize [text]
   (let [core-labels (tokenize-corelabels text)]
     (map #(assoc {}
-            :token (.get % CoreAnnotations$TextAnnotation)
+            :token (ann/get-text %)
             :start-offset (.beginPosition %)
             :end-offset (.endPosition %))
          core-labels)))
 
-
 (defn split-sentences [text]
   "Split a string into a sequence of sentences, each of which is a sequence of CoreLabels"
   (let [rdr (StringReader. text)]
@@ -72,7 +68,7 @@
   (last (map #(.endPosition %) core-labels)))
 
 (defn sentence-text [core-labels]
-  (map #(.get % CoreAnnotations$TextAnnotation) core-labels))
+  (map ann/get-text core-labels))
 
 
 (defn sentenize [text]
@@ -150,8 +146,8 @@
 (defn- get-tokens-entities
   "builds map: {:token token :named-entity named-entity}"
   [tok-ann]
-  {:token (.get tok-ann CoreAnnotations$TextAnnotation)
-   :named-entity (.get tok-ann CoreAnnotations$NamedEntityTagAnnotation)
+  {:token (ann/get-text tok-ann)
+   :named-entity (ann/get-named-entity-tag tok-ann)
    :start-offset (.beginPosition tok-ann)
    :end-offset (.endPosition tok-ann)})
 
@@ -159,7 +155,7 @@
   "Passes TokenAnnotations extracted from SentencesAnnotation to get-tokens-entities
   which returns a map {:token token :named-entity ne}"
   [sentence-annotation]
-  (map get-tokens-entities (.get sentence-annotation CoreAnnotations$TokensAnnotation)))
+  (map get-tokens-entities (ann/get-tokens sentence-annotation)))
 
 (defn- get-text-tokens [sen-ann]
   "builds map: {:tokens tokens}"
@@ -170,7 +166,7 @@
   get-text-tokens which returns a map:
   {:tokens {:token token :named-entity ne}}"
   [^Annotation annotation]
-  (map get-text-tokens (.get annotation CoreAnnotations$SentencesAnnotation)))
+  (map get-text-tokens (ann/get-sentences annotation)))
 
 (defn tag-ner
   "Returns a map object containing original text, tokens, sentences"