|
10 | 10 | import java.util.Set;
|
11 | 11 | import java.util.stream.Collectors;
|
12 | 12 | import java.util.stream.Stream;
|
13 |
| - |
14 | 13 | import javax.ws.rs.WebApplicationException;
|
15 | 14 | import javax.xml.bind.JAXBException;
|
16 |
| - |
17 | 15 | import com.fasterxml.jackson.databind.ObjectMapper;
|
18 | 16 | import com.fasterxml.jackson.databind.SerializationFeature;
|
19 | 17 | import com.google.devtools.common.options.OptionsParser;
|
|
23 | 21 | import com.weblyzard.api.model.document.Document;
|
24 | 22 | import com.weblyzard.api.model.document.MirrorDocument;
|
25 | 23 | import com.weblyzard.api.model.jesaja.KeywordCalculationProfile;
|
26 |
| - |
27 | 24 | import lombok.extern.slf4j.Slf4j;
|
28 | 25 |
|
29 | 26 | @Slf4j
|
|
34 | 31 | *
|
35 | 32 | */
|
36 | 33 | public class KeywordExtractor {
|
37 |
| - |
38 |
| - private static JeremiaClient preProcessingClient; |
39 |
| - private static JesajaClient keywordExtractionClient; |
40 |
| - |
41 |
| - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); |
42 |
| - |
43 |
| - private static KeywordCalculationProfile KEYWORD_PROFILE = new KeywordCalculationProfile() |
44 |
| - .setValidPosTags(Set.of("NE", "NNS", "NN", "NNP", "P", "ADJ")) |
45 |
| - .setMinPhraseSignificance(2) |
46 |
| - .setNumKeywords(15) |
47 |
| - .setKeywordAlgorithm("com.weblyzard.backend.jesaja.algorithm.keywords.YatesKeywordSignificanceAlgorithm") |
48 |
| - .setMinTokenCount(5) |
49 |
| - .setSkipUnderrepresentedKeywords(true); |
50 |
| - |
51 |
| - public static void main(String[] argv) throws IOException, JAXBException { |
52 |
| - OptionsParser parser = OptionsParser.newOptionsParser(KeywordExtractorOption.class); |
53 |
| - parser.parseAndExitUponError(argv); |
54 |
| - KeywordExtractorOption options = parser.getOptions(KeywordExtractorOption.class); |
55 |
| - |
56 |
| - if (options.printHelp || options.webServiceBaseUrl.isEmpty() || options.profileName.isEmpty()) { |
57 |
| - printUsage(parser); |
58 |
| - return; |
59 |
| - } |
60 |
| - |
61 |
| - // setup web services |
62 |
| - setupWebServices(options); |
63 |
| - |
64 |
| - // train the component with the provided reference corpus |
65 |
| - if (!options.referenceCorpusDirectory.isEmpty()) { |
66 |
| - List<Document> documents = getDocuments(options.referenceCorpusDirectory); |
67 |
| - trainJesaja(options.profileName, documents); |
68 |
| - } |
69 |
| - |
70 |
| - // compute keywords |
71 |
| - if (!options.targetCorpusDirectory.isEmpty()) { |
72 |
| - List<Document> documents = getDocuments(options.targetCorpusDirectory); |
73 |
| - try { |
74 |
| - Map<String, Map<String, Double>> keywords = keywordExtractionClient.getKeywords(options.profileName, documents); |
75 |
| - System.out.println(OBJECT_MAPPER.writeValueAsString(keywords)); |
76 |
| - } catch (WebApplicationException | JAXBException e) { |
77 |
| - log.error("Cannot extract keywords: {}", e); |
78 |
| - System.exit(-1); |
79 |
| - } |
80 |
| - } |
81 |
| - |
82 |
| - } |
83 |
| - |
84 |
| - /** |
85 |
| - * Setup and configure the Web services based on the provided {@link KeywordExtractorOption}s |
86 |
| - * |
87 |
| - * @param options used for the Web service configuration |
88 |
| - */ |
89 |
| - private static void setupWebServices(KeywordExtractorOption options) { |
90 |
| - WebserviceClientConfig jeremiaConfig = new WebserviceClientConfig().setUrl(options.webServiceBaseUrl) |
91 |
| - .setUsername(options.webServiceUserName).setPassword(options.webServiceUserPassword); |
92 |
| - WebserviceClientConfig jesajaConfig = new WebserviceClientConfig().setUrl(options.webServiceBaseUrl) |
93 |
| - .setUsername(options.webServiceUserName).setPassword(options.webServiceUserPassword); |
94 |
| - System.out.println(jeremiaConfig); |
95 |
| - // use standard service ports, if the web service has been deployed locally |
96 |
| - if (options.webServiceBaseUrl.startsWith("http://localhost") || options.webServiceBaseUrl.startsWith("http://127.0.0.1")) { |
97 |
| - jeremiaConfig.setServicePrefix(":63001"); |
98 |
| - jesajaConfig.setServicePrefix(":63002"); |
99 |
| - } |
100 |
| - |
101 |
| - preProcessingClient = new JeremiaClient(jeremiaConfig); |
102 |
| - keywordExtractionClient = new JesajaClient(jesajaConfig); |
103 |
| - |
104 |
| - // setup keyword service configuration |
105 |
| - keywordExtractionClient.setKeywordProfile(options.profileName, KEYWORD_PROFILE); |
106 |
| - keywordExtractionClient.setMatviewProfile(options.profileName, options.profileName); |
107 |
| - } |
108 |
| - |
109 |
| - /** |
110 |
| - * Train the keyword extraction service with the documents provided in the reference |
111 |
| - * corpus. |
112 |
| - * |
113 |
| - * @param profileName |
114 |
| - * @param documents |
115 |
| - */ |
116 |
| - private static void trainJesaja(String profileName, List<Document> documents) { |
117 |
| - try { |
118 |
| - while (keywordExtractionClient.rotateShard(profileName) == 0) { |
119 |
| - keywordExtractionClient.addDocuments(profileName, documents); |
120 |
| - } |
121 |
| - } catch (WebApplicationException | JAXBException e) { |
122 |
| - log.error("Cannot train keyword service: {}", e); |
123 |
| - System.exit(-1); |
124 |
| - } |
125 |
| - } |
126 |
| - |
127 |
| - |
128 |
| - /** |
129 |
| - * Read all documents from the given directory, perform pre-processing and |
130 |
| - * convert them into a list of {@link Document} objects. |
131 |
| - * |
132 |
| - * @param documentDirectory |
133 |
| - * @return |
134 |
| - */ |
135 |
| - private static List<Document> getDocuments(String documentDirectory) { |
136 |
| - try { |
137 |
| - return getDocuments(Files.list(Paths.get(documentDirectory))); |
138 |
| - } catch (IOException e) { |
139 |
| - log.error("Cannot open corpus directory '{}': {}", documentDirectory, e); |
140 |
| - System.exit(-1); |
141 |
| - } |
142 |
| - return null; |
143 |
| - } |
144 |
| - |
145 |
| - /** |
146 |
| - * @return a list of Document objects |
147 |
| - */ |
148 |
| - private static List<Document> getDocuments(Stream<Path> documents) { |
149 |
| - List<MirrorDocument> inputDocuments = documents.map(documentPath -> |
150 |
| - { |
151 |
| - try { |
152 |
| - return new MirrorDocument().setId(documentPath.toString()).setBody(new String(Files.readAllBytes(documentPath))); |
153 |
| - } catch (IOException e) { |
154 |
| - log.warn("Cannot open input document '{}': {}", documentPath, e); |
155 |
| - return null; |
156 |
| - } |
157 |
| - } |
158 |
| - ).filter(document -> document != null).collect(Collectors.toList()); |
159 |
| - |
160 |
| - // create the input structure for the pre-processing web service |
161 |
| - return preProcessingClient.submitDocuments(inputDocuments, "-1"); |
162 |
| - } |
163 |
| - |
164 |
| - |
165 |
| - |
166 |
| - /** |
167 |
| - * Provide usage information for the given {@link OptionsParser}. |
168 |
| - * |
169 |
| - * @param parser |
170 |
| - */ |
171 |
| - private static void printUsage(OptionsParser parser) { |
172 |
| - System.out.println("Usage: java -jar example-keyword-extractor OPTIONS"); |
173 |
| - System.out.println(parser.describeOptions(Collections.emptyMap(), OptionsParser.HelpVerbosity.LONG)); |
174 |
| - } |
175 |
| - |
| 34 | + |
| 35 | + private static JeremiaClient preProcessingClient; |
| 36 | + private static JesajaClient keywordExtractionClient; |
| 37 | + |
| 38 | + private static final ObjectMapper OBJECT_MAPPER = |
| 39 | + new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); |
| 40 | + |
| 41 | + private static KeywordCalculationProfile KEYWORD_PROFILE = new KeywordCalculationProfile() |
| 42 | + .setValidPosTags(Set.of("NE", "NNS", "NN", "NNP", "P", "ADJ")) |
| 43 | + .setMinPhraseSignificance(2).setNumKeywords(15) |
| 44 | + .setKeywordAlgorithm( |
| 45 | + "com.weblyzard.backend.jesaja.algorithm.keywords.YatesKeywordSignificanceAlgorithm") |
| 46 | + .setMinTokenCount(5).setSkipUnderrepresentedKeywords(true); |
| 47 | + |
| 48 | + public static void main(String[] argv) throws IOException, JAXBException { |
| 49 | + OptionsParser parser = OptionsParser.newOptionsParser(KeywordExtractorOption.class); |
| 50 | + parser.parseAndExitUponError(argv); |
| 51 | + KeywordExtractorOption options = parser.getOptions(KeywordExtractorOption.class); |
| 52 | + |
| 53 | + if (options.printHelp || options.webServiceBaseUrl.isEmpty() |
| 54 | + || options.profileName.isEmpty()) { |
| 55 | + printUsage(parser); |
| 56 | + return; |
| 57 | + } |
| 58 | + |
| 59 | + // setup web services |
| 60 | + setupWebServices(options); |
| 61 | + |
| 62 | + // train the component with the provided reference corpus |
| 63 | + if (!options.referenceCorpusDirectory.isEmpty()) { |
| 64 | + List<Document> documents = getDocuments(options.referenceCorpusDirectory); |
| 65 | + trainJesaja(options.profileName, documents); |
| 66 | + } |
| 67 | + |
| 68 | + // compute keywords |
| 69 | + if (!options.targetCorpusDirectory.isEmpty()) { |
| 70 | + List<Document> documents = getDocuments(options.targetCorpusDirectory); |
| 71 | + try { |
| 72 | + Map<String, Map<String, Double>> keywords = |
| 73 | + keywordExtractionClient.getKeywords(options.profileName, documents); |
| 74 | + System.out.println(OBJECT_MAPPER.writeValueAsString(keywords)); |
| 75 | + } catch (WebApplicationException | JAXBException e) { |
| 76 | + log.error("Cannot extract keywords: {}", e); |
| 77 | + System.exit(-1); |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + } |
| 82 | + |
| 83 | + /** |
| 84 | + * Setup and configure the Web services based on the provided {@link KeywordExtractorOption}s. |
| 85 | + * |
| 86 | + * @param options used for the Web service configuration |
| 87 | + */ |
| 88 | + private static void setupWebServices(KeywordExtractorOption options) { |
| 89 | + WebserviceClientConfig jeremiaConfig = new WebserviceClientConfig() |
| 90 | + .setUrl(options.webServiceBaseUrl).setUsername(options.webServiceUserName) |
| 91 | + .setPassword(options.webServiceUserPassword) |
| 92 | + .setUseCompression(Boolean.getBoolean(options.useCompression)); |
| 93 | + WebserviceClientConfig jesajaConfig = new WebserviceClientConfig() |
| 94 | + .setUrl(options.webServiceBaseUrl).setUsername(options.webServiceUserName) |
| 95 | + .setPassword(options.webServiceUserPassword) |
| 96 | + .setUseCompression(Boolean.getBoolean(options.useCompression)); |
| 97 | + System.out.println(jeremiaConfig); |
| 98 | + // use standard service ports, if the web service has been deployed locally |
| 99 | + if (options.webServiceBaseUrl.startsWith("http://localhost") |
| 100 | + || options.webServiceBaseUrl.startsWith("http://127.0.0.1")) { |
| 101 | + jeremiaConfig.setServicePrefix(":63001"); |
| 102 | + jesajaConfig.setServicePrefix(":63002"); |
| 103 | + } |
| 104 | + |
| 105 | + preProcessingClient = new JeremiaClient(jeremiaConfig); |
| 106 | + keywordExtractionClient = new JesajaClient(jesajaConfig); |
| 107 | + |
| 108 | + // setup keyword service configuration |
| 109 | + keywordExtractionClient.setKeywordProfile(options.profileName, KEYWORD_PROFILE); |
| 110 | + keywordExtractionClient.setMatviewProfile(options.profileName, options.profileName); |
| 111 | + } |
| 112 | + |
| 113 | + /** |
| 114 | + * Train the keyword extraction service with the documents provided in the reference corpus. |
| 115 | + * |
| 116 | + * @param profileName the name of the profile to train |
| 117 | + * @param documents the {@link Document}s used for training |
| 118 | + */ |
| 119 | + private static void trainJesaja(String profileName, List<Document> documents) { |
| 120 | + try { |
| 121 | + while (keywordExtractionClient.rotateShard(profileName) == 0) { |
| 122 | + keywordExtractionClient.addDocuments(profileName, documents); |
| 123 | + } |
| 124 | + } catch (WebApplicationException | JAXBException e) { |
| 125 | + log.error("Cannot train keyword service: {}", e); |
| 126 | + System.exit(-1); |
| 127 | + } |
| 128 | + } |
| 129 | + |
| 130 | + /** |
| 131 | + * Read all documents from the given directory, perform pre-processing and convert them into a |
| 132 | + * list. of {@link Document} objects. |
| 133 | + * |
| 134 | + * @param documentDirectory the directory containing the documents |
| 135 | + * @return the list of {@link Document} objects read from the directory |
| 136 | + */ |
| 137 | + private static List<Document> getDocuments(String documentDirectory) { |
| 138 | + try { |
| 139 | + return getDocuments(Files.list(Paths.get(documentDirectory))); |
| 140 | + } catch (IOException e) { |
| 141 | + log.error("Cannot open corpus directory '{}': {}", documentDirectory, e); |
| 142 | + System.exit(-1); |
| 143 | + } |
| 144 | + return null; |
| 145 | + } |
| 146 | + |
| 147 | + /** |
| 148 | + * Returns a list of Document objects. |
| 149 | + */ |
| 150 | + private static List<Document> getDocuments(Stream<Path> documents) { |
| 151 | + List<MirrorDocument> inputDocuments = documents.map(documentPath -> { |
| 152 | + try { |
| 153 | + return new MirrorDocument().setId(documentPath.toString()) |
| 154 | + .setBody(new String(Files.readAllBytes(documentPath))); |
| 155 | + } catch (IOException e) { |
| 156 | + log.warn("Cannot open input document '{}': {}", documentPath, e); |
| 157 | + return null; |
| 158 | + } |
| 159 | + }).filter(document -> document != null).collect(Collectors.toList()); |
| 160 | + |
| 161 | + // create the input structure for the pre-processing web service |
| 162 | + return preProcessingClient.submitDocuments(inputDocuments, "-1"); |
| 163 | + } |
| 164 | + |
| 165 | + /** |
| 166 | + * Provide usage information for the given {@link OptionsParser}. |
| 167 | + */ |
| 168 | + private static void printUsage(OptionsParser parser) { |
| 169 | + System.out.println("Usage: java -jar example-keyword-extractor OPTIONS"); |
| 170 | + System.out.println( |
| 171 | + parser.describeOptions(Collections.emptyMap(), OptionsParser.HelpVerbosity.LONG)); |
| 172 | + } |
| 173 | + |
176 | 174 | }
|
0 commit comments