Skip to content

Commit 8aee5bf

Browse files
Merge pull request #153 from weblyzard/feature/compression-support
Feature/compression support
2 parents e423901 + d4b6abd commit 8aee5bf

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+423
-337
lines changed

java-examples/keyword-extraction/pom.xml

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,23 +25,6 @@
2525
</extension>
2626
</extensions>
2727
<plugins>
28-
<plugin>
29-
<groupId>org.apache.maven.plugins</groupId>
30-
<artifactId>maven-eclipse-plugin</artifactId>
31-
<version>2.10</version>
32-
<configuration>
33-
<additionalConfig>
34-
<file>
35-
<name>.settings/org.eclipse.jdt.core.prefs</name>
36-
<url>https://raw.githubusercontent.com/weblyzard/eclipse-settings/master/settings/org.eclipse.jdt.core.prefs</url>
37-
</file>
38-
<file>
39-
<name>.settings/org.eclipse.jdt.ui.prefs</name>
40-
<url>https://raw.githubusercontent.com/weblyzard/eclipse-settings/master/settings/org.eclipse.jdt.ui.prefs</url>
41-
</file>
42-
</additionalConfig>
43-
</configuration>
44-
</plugin>
4528
<plugin>
4629
<groupId>org.apache.maven.plugins</groupId>
4730
<artifactId>maven-compiler-plugin</artifactId>
@@ -52,6 +35,36 @@
5235
</configuration>
5336
</plugin>
5437
<plugin>
38+
<groupId>org.apache.maven.plugins</groupId>
39+
<artifactId>maven-checkstyle-plugin</artifactId>
40+
<version>3.0.0</version>
41+
<dependencies>
42+
<dependency>
43+
<groupId>com.puppycrawl.tools</groupId>
44+
<artifactId>checkstyle</artifactId>
45+
<version>8.14</version>
46+
</dependency>
47+
</dependencies>
48+
<executions>
49+
<execution>
50+
<id>checkstyle</id>
51+
<phase>validate</phase>
52+
<configuration>
53+
<!-- <configLocation>google_checks.xml</configLocation> -->
54+
<configLocation>https://raw.githubusercontent.com/weblyzard/checkstyle-rules/master/java/checkstyle_relaxed.xml</configLocation>
55+
<encoding>UTF-8</encoding>
56+
<consoleOutput>true</consoleOutput>
57+
<linkXRef>false</linkXRef>
58+
<failOnViolation>true</failOnViolation>
59+
<violationSeverity>warning</violationSeverity>
60+
</configuration>
61+
<goals>
62+
<goal>check</goal>
63+
</goals>
64+
</execution>
65+
</executions>
66+
</plugin>
67+
<plugin>
5568
<groupId>org.sonarsource.scanner.maven</groupId>
5669
<artifactId>sonar-maven-plugin</artifactId>
5770
<version>3.4.0.905</version>

java-examples/keyword-extraction/src/main/java/com/weblyzard/api/example/keyword/KeywordExtractor.java

Lines changed: 140 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,8 @@
1010
import java.util.Set;
1111
import java.util.stream.Collectors;
1212
import java.util.stream.Stream;
13-
1413
import javax.ws.rs.WebApplicationException;
1514
import javax.xml.bind.JAXBException;
16-
1715
import com.fasterxml.jackson.databind.ObjectMapper;
1816
import com.fasterxml.jackson.databind.SerializationFeature;
1917
import com.google.devtools.common.options.OptionsParser;
@@ -23,7 +21,6 @@
2321
import com.weblyzard.api.model.document.Document;
2422
import com.weblyzard.api.model.document.MirrorDocument;
2523
import com.weblyzard.api.model.jesaja.KeywordCalculationProfile;
26-
2724
import lombok.extern.slf4j.Slf4j;
2825

2926
@Slf4j
@@ -34,143 +31,144 @@
3431
*
3532
*/
3633
public class KeywordExtractor {
37-
38-
private static JeremiaClient preProcessingClient;
39-
private static JesajaClient keywordExtractionClient;
40-
41-
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);
42-
43-
private static KeywordCalculationProfile KEYWORD_PROFILE = new KeywordCalculationProfile()
44-
.setValidPosTags(Set.of("NE", "NNS", "NN", "NNP", "P", "ADJ"))
45-
.setMinPhraseSignificance(2)
46-
.setNumKeywords(15)
47-
.setKeywordAlgorithm("com.weblyzard.backend.jesaja.algorithm.keywords.YatesKeywordSignificanceAlgorithm")
48-
.setMinTokenCount(5)
49-
.setSkipUnderrepresentedKeywords(true);
50-
51-
public static void main(String[] argv) throws IOException, JAXBException {
52-
OptionsParser parser = OptionsParser.newOptionsParser(KeywordExtractorOption.class);
53-
parser.parseAndExitUponError(argv);
54-
KeywordExtractorOption options = parser.getOptions(KeywordExtractorOption.class);
55-
56-
if (options.printHelp || options.webServiceBaseUrl.isEmpty() || options.profileName.isEmpty()) {
57-
printUsage(parser);
58-
return;
59-
}
60-
61-
// setup web services
62-
setupWebServices(options);
63-
64-
// train the component with the provided reference corpus
65-
if (!options.referenceCorpusDirectory.isEmpty()) {
66-
List<Document> documents = getDocuments(options.referenceCorpusDirectory);
67-
trainJesaja(options.profileName, documents);
68-
}
69-
70-
// compute keywords
71-
if (!options.targetCorpusDirectory.isEmpty()) {
72-
List<Document> documents = getDocuments(options.targetCorpusDirectory);
73-
try {
74-
Map<String, Map<String, Double>> keywords = keywordExtractionClient.getKeywords(options.profileName, documents);
75-
System.out.println(OBJECT_MAPPER.writeValueAsString(keywords));
76-
} catch (WebApplicationException | JAXBException e) {
77-
log.error("Cannot extract keywords: {}", e);
78-
System.exit(-1);
79-
}
80-
}
81-
82-
}
83-
84-
/**
85-
* Setup and configure the Web services based on the provided {@link KeywordExtractorOption}s
86-
*
87-
* @param options used for the Web service configuration
88-
*/
89-
private static void setupWebServices(KeywordExtractorOption options) {
90-
WebserviceClientConfig jeremiaConfig = new WebserviceClientConfig().setUrl(options.webServiceBaseUrl)
91-
.setUsername(options.webServiceUserName).setPassword(options.webServiceUserPassword);
92-
WebserviceClientConfig jesajaConfig = new WebserviceClientConfig().setUrl(options.webServiceBaseUrl)
93-
.setUsername(options.webServiceUserName).setPassword(options.webServiceUserPassword);
94-
System.out.println(jeremiaConfig);
95-
// use standard service ports, if the web service has been deployed locally
96-
if (options.webServiceBaseUrl.startsWith("http://localhost") || options.webServiceBaseUrl.startsWith("http://127.0.0.1")) {
97-
jeremiaConfig.setServicePrefix(":63001");
98-
jesajaConfig.setServicePrefix(":63002");
99-
}
100-
101-
preProcessingClient = new JeremiaClient(jeremiaConfig);
102-
keywordExtractionClient = new JesajaClient(jesajaConfig);
103-
104-
// setup keyword service configuration
105-
keywordExtractionClient.setKeywordProfile(options.profileName, KEYWORD_PROFILE);
106-
keywordExtractionClient.setMatviewProfile(options.profileName, options.profileName);
107-
}
108-
109-
/**
110-
* Train the keyword extraction service with the documents provided in the reference
111-
* corpus.
112-
*
113-
* @param profileName
114-
* @param documents
115-
*/
116-
private static void trainJesaja(String profileName, List<Document> documents) {
117-
try {
118-
while (keywordExtractionClient.rotateShard(profileName) == 0) {
119-
keywordExtractionClient.addDocuments(profileName, documents);
120-
}
121-
} catch (WebApplicationException | JAXBException e) {
122-
log.error("Cannot train keyword service: {}", e);
123-
System.exit(-1);
124-
}
125-
}
126-
127-
128-
/**
129-
* Read all documents from the given directory, perform pre-processing and
130-
* convert them into a list of {@link Document} objects.
131-
*
132-
* @param documentDirectory
133-
* @return
134-
*/
135-
private static List<Document> getDocuments(String documentDirectory) {
136-
try {
137-
return getDocuments(Files.list(Paths.get(documentDirectory)));
138-
} catch (IOException e) {
139-
log.error("Cannot open corpus directory '{}': {}", documentDirectory, e);
140-
System.exit(-1);
141-
}
142-
return null;
143-
}
144-
145-
/**
146-
* @return a list of Document objects
147-
*/
148-
private static List<Document> getDocuments(Stream<Path> documents) {
149-
List<MirrorDocument> inputDocuments = documents.map(documentPath ->
150-
{
151-
try {
152-
return new MirrorDocument().setId(documentPath.toString()).setBody(new String(Files.readAllBytes(documentPath)));
153-
} catch (IOException e) {
154-
log.warn("Cannot open input document '{}': {}", documentPath, e);
155-
return null;
156-
}
157-
}
158-
).filter(document -> document != null).collect(Collectors.toList());
159-
160-
// create the input structure for the pre-processing web service
161-
return preProcessingClient.submitDocuments(inputDocuments, "-1");
162-
}
163-
164-
165-
166-
/**
167-
* Provide usage information for the given {@link OptionsParser}.
168-
*
169-
* @param parser
170-
*/
171-
private static void printUsage(OptionsParser parser) {
172-
System.out.println("Usage: java -jar example-keyword-extractor OPTIONS");
173-
System.out.println(parser.describeOptions(Collections.emptyMap(), OptionsParser.HelpVerbosity.LONG));
174-
}
175-
34+
35+
private static JeremiaClient preProcessingClient;
36+
private static JesajaClient keywordExtractionClient;
37+
38+
private static final ObjectMapper OBJECT_MAPPER =
39+
new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);
40+
41+
private static KeywordCalculationProfile KEYWORD_PROFILE = new KeywordCalculationProfile()
42+
.setValidPosTags(Set.of("NE", "NNS", "NN", "NNP", "P", "ADJ"))
43+
.setMinPhraseSignificance(2).setNumKeywords(15)
44+
.setKeywordAlgorithm(
45+
"com.weblyzard.backend.jesaja.algorithm.keywords.YatesKeywordSignificanceAlgorithm")
46+
.setMinTokenCount(5).setSkipUnderrepresentedKeywords(true);
47+
48+
public static void main(String[] argv) throws IOException, JAXBException {
49+
OptionsParser parser = OptionsParser.newOptionsParser(KeywordExtractorOption.class);
50+
parser.parseAndExitUponError(argv);
51+
KeywordExtractorOption options = parser.getOptions(KeywordExtractorOption.class);
52+
53+
if (options.printHelp || options.webServiceBaseUrl.isEmpty()
54+
|| options.profileName.isEmpty()) {
55+
printUsage(parser);
56+
return;
57+
}
58+
59+
// setup web services
60+
setupWebServices(options);
61+
62+
// train the component with the provided reference corpus
63+
if (!options.referenceCorpusDirectory.isEmpty()) {
64+
List<Document> documents = getDocuments(options.referenceCorpusDirectory);
65+
trainJesaja(options.profileName, documents);
66+
}
67+
68+
// compute keywords
69+
if (!options.targetCorpusDirectory.isEmpty()) {
70+
List<Document> documents = getDocuments(options.targetCorpusDirectory);
71+
try {
72+
Map<String, Map<String, Double>> keywords =
73+
keywordExtractionClient.getKeywords(options.profileName, documents);
74+
System.out.println(OBJECT_MAPPER.writeValueAsString(keywords));
75+
} catch (WebApplicationException | JAXBException e) {
76+
log.error("Cannot extract keywords: {}", e);
77+
System.exit(-1);
78+
}
79+
}
80+
81+
}
82+
83+
/**
84+
* Setup and configure the Web services based on the provided {@link KeywordExtractorOption}s.
85+
*
86+
* @param options used for the Web service configuration
87+
*/
88+
private static void setupWebServices(KeywordExtractorOption options) {
89+
WebserviceClientConfig jeremiaConfig = new WebserviceClientConfig()
90+
.setUrl(options.webServiceBaseUrl).setUsername(options.webServiceUserName)
91+
.setPassword(options.webServiceUserPassword)
92+
.setUseCompression(Boolean.getBoolean(options.useCompression));
93+
WebserviceClientConfig jesajaConfig = new WebserviceClientConfig()
94+
.setUrl(options.webServiceBaseUrl).setUsername(options.webServiceUserName)
95+
.setPassword(options.webServiceUserPassword)
96+
.setUseCompression(Boolean.getBoolean(options.useCompression));
97+
System.out.println(jeremiaConfig);
98+
// use standard service ports, if the web service has been deployed locally
99+
if (options.webServiceBaseUrl.startsWith("http://localhost")
100+
|| options.webServiceBaseUrl.startsWith("http://127.0.0.1")) {
101+
jeremiaConfig.setServicePrefix(":63001");
102+
jesajaConfig.setServicePrefix(":63002");
103+
}
104+
105+
preProcessingClient = new JeremiaClient(jeremiaConfig);
106+
keywordExtractionClient = new JesajaClient(jesajaConfig);
107+
108+
// setup keyword service configuration
109+
keywordExtractionClient.setKeywordProfile(options.profileName, KEYWORD_PROFILE);
110+
keywordExtractionClient.setMatviewProfile(options.profileName, options.profileName);
111+
}
112+
113+
/**
114+
* Train the keyword extraction service with the documents provided in the reference corpus.
115+
*
116+
* @param profileName the name of the profile to train
117+
* @param documents the {@link Document}s used for training
118+
*/
119+
private static void trainJesaja(String profileName, List<Document> documents) {
120+
try {
121+
while (keywordExtractionClient.rotateShard(profileName) == 0) {
122+
keywordExtractionClient.addDocuments(profileName, documents);
123+
}
124+
} catch (WebApplicationException | JAXBException e) {
125+
log.error("Cannot train keyword service: {}", e);
126+
System.exit(-1);
127+
}
128+
}
129+
130+
/**
131+
* Read all documents from the given directory, perform pre-processing and convert them into a
132+
* list. of {@link Document} objects.
133+
*
134+
* @param documentDirectory the directory containing the documents
135+
* @return the list of {@link Document} objects read from the directory
136+
*/
137+
private static List<Document> getDocuments(String documentDirectory) {
138+
try {
139+
return getDocuments(Files.list(Paths.get(documentDirectory)));
140+
} catch (IOException e) {
141+
log.error("Cannot open corpus directory '{}': {}", documentDirectory, e);
142+
System.exit(-1);
143+
}
144+
return null;
145+
}
146+
147+
/**
148+
* Returns a list of Document objects.
149+
*/
150+
private static List<Document> getDocuments(Stream<Path> documents) {
151+
List<MirrorDocument> inputDocuments = documents.map(documentPath -> {
152+
try {
153+
return new MirrorDocument().setId(documentPath.toString())
154+
.setBody(new String(Files.readAllBytes(documentPath)));
155+
} catch (IOException e) {
156+
log.warn("Cannot open input document '{}': {}", documentPath, e);
157+
return null;
158+
}
159+
}).filter(document -> document != null).collect(Collectors.toList());
160+
161+
// create the input structure for the pre-processing web service
162+
return preProcessingClient.submitDocuments(inputDocuments, "-1");
163+
}
164+
165+
/**
166+
* Provide usage information for the given {@link OptionsParser}.
167+
*/
168+
private static void printUsage(OptionsParser parser) {
169+
System.out.println("Usage: java -jar example-keyword-extractor OPTIONS");
170+
System.out.println(
171+
parser.describeOptions(Collections.emptyMap(), OptionsParser.HelpVerbosity.LONG));
172+
}
173+
176174
}

0 commit comments

Comments
 (0)