Skip to content

Commit cde3eda

Browse files
committed
init commit
0 parents  commit cde3eda

File tree

6 files changed

+645
-0
lines changed

6 files changed

+645
-0
lines changed

.gitignore

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Eclipse
2+
.classpath
3+
.project
4+
.settings/
5+
6+
# Intellij
7+
.idea/
8+
*.iml
9+
*.iws
10+
11+
# Mac
12+
.DS_Store
13+
14+
# Maven
15+
log/
16+
target/

pom.xml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3+
<modelVersion>4.0.0</modelVersion>
4+
<groupId>HttpLogAnalyser</groupId>
5+
<artifactId>HttpLogAnalyser</artifactId>
6+
<version>0.0.1-SNAPSHOT</version>
7+
8+
<dependencies>
9+
<dependency>
10+
<groupId>org.apache.hadoop</groupId>
11+
<artifactId>hadoop-mapreduce-client-core</artifactId>
12+
<version>2.7.3</version>
13+
</dependency>
14+
15+
<dependency>
16+
<groupId>org.apache.hadoop</groupId>
17+
<artifactId>hadoop-common</artifactId>
18+
<version>2.7.3</version>
19+
</dependency>
20+
21+
<dependency>
22+
<groupId>org.apache.hadoop</groupId>
23+
<artifactId>hadoop-hdfs</artifactId>
24+
<version>2.7.3</version>
25+
</dependency>
26+
27+
<!-- NOTE(review): a system-scoped jdk.tools dependency with an absolute
     Windows systemPath breaks the build on any other machine. Prefer a
     ${java.home}-relative path (e.g. ${java.home}/../lib/tools.jar), or
     drop it entirely on JDK 9+ where tools.jar no longer exists. -->
<dependency>
28+
<groupId>jdk.tools</groupId>
29+
<artifactId>jdk.tools</artifactId>
30+
<version>1.8</version>
31+
<scope>system</scope>
32+
<systemPath>C:/Program Files/Java/jdk1.8.0_121/lib/tools.jar</systemPath>
33+
</dependency>
34+
</dependencies>
35+
</project>
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package csd.auth.ftw;
2+
3+
import java.io.IOException;
4+
import java.nio.charset.StandardCharsets;
5+
import java.nio.file.Files;
6+
import java.nio.file.Paths;
7+
import java.util.ArrayList;
8+
import java.util.Arrays;
9+
import java.util.HashSet;
10+
import java.util.List;
11+
12+
import org.apache.hadoop.fs.FileSystem;
13+
import org.apache.hadoop.io.Text;
14+
import org.apache.hadoop.mapreduce.Mapper;
15+
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
16+
17+
public class MapperClass extends Mapper<Text, Text, Text, Text> {
18+
public static final String PUNCTUATION = "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘…„—´";
19+
20+
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
21+
String line = value.toString().toLowerCase();
22+
23+
// remove whitespace
24+
line = removeWhiteSpace(line);
25+
26+
// remove punctuation
27+
line = removePunctuation(line);
28+
29+
// keep unique words
30+
HashSet<String> words = new HashSet<String>(Arrays.asList(line.split(" ")));
31+
32+
// remove stopwords
33+
words = removeStopwords(words);
34+
35+
// stem each word
36+
ArrayList<String> stemmedWords = new ArrayList<String>();
37+
for (String word: words)
38+
stemmedWords.add(applyStemming(word));
39+
40+
for (String word: stemmedWords) {
41+
Text keyWord = new Text(word);
42+
Text valueFilename = new Text(getCurrentFilename(context));
43+
44+
context.write(keyWord, valueFilename);
45+
}
46+
}
47+
48+
private String removeWhiteSpace(String line) {
49+
String pattern = "\\s+";
50+
return line.replaceAll(pattern, " ").trim();
51+
}
52+
53+
private String removePunctuation(String str) {
54+
String pattern = String.join("|\\", PUNCTUATION.split(""));
55+
return str.replaceAll(pattern, "");
56+
}
57+
58+
private HashSet<String> removeStopwords(HashSet<String> words) throws IOException {
59+
List<String> stopwords = Files.readAllLines(Paths.get("C:\\users\\nikos\\stopwords.txt"), StandardCharsets.UTF_8);
60+
HashSet<String> filteredWords = new HashSet<>();
61+
62+
for (String word: words) {
63+
if (!stopwords.contains(word))
64+
filteredWords.add(word);
65+
}
66+
67+
return filteredWords;
68+
}
69+
70+
private String applyStemming(String word) {
71+
Stemmer stemmer = new Stemmer();
72+
stemmer.add(word.toCharArray(), word.length());
73+
stemmer.stem();
74+
75+
return stemmer.toString();
76+
}
77+
78+
private String getCurrentFilename(Context context) {
79+
return ((FileSplit) context.getInputSplit()).getPath().getName();
80+
}
81+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package csd.auth.ftw;
2+
3+
import java.io.IOException;
4+
import java.util.ArrayList;
5+
import java.util.Arrays;
6+
7+
import org.apache.hadoop.fs.FileStatus;
8+
import org.apache.hadoop.fs.FileSystem;
9+
import org.apache.hadoop.fs.Path;
10+
import org.apache.hadoop.io.Text;
11+
import org.apache.hadoop.mapreduce.Reducer;
12+
13+
public class ReducerClass extends Reducer<Text, Text, Text, Text> {
14+
private ArrayList<String> inputFiles = null;
15+
16+
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException {
17+
resolveInputFiles(context);
18+
19+
int[] disList = new int[inputFiles.size()];
20+
Arrays.fill(disList, 0);
21+
22+
for (Text filename: values) {
23+
int id = getFilenameId(filename);
24+
disList[id] = 1;
25+
}
26+
27+
// TODO write distlist
28+
// context.write(key, arg1);
29+
}
30+
31+
private void resolveInputFiles(Context context) throws IOException {
32+
if (inputFiles != null)
33+
return;
34+
35+
inputFiles = new ArrayList<>();
36+
37+
FileSystem fs = FileSystem.get(context.getConfiguration());
38+
String filename = context.getConfiguration().get("map.input.dir");
39+
FileStatus[] filesStatuses = fs.listStatus(new Path(filename));
40+
41+
for (int i=0; i<filesStatuses.length; i++) {
42+
FileStatus status = filesStatuses[i];
43+
inputFiles.add(status.getPath().toString());
44+
}
45+
}
46+
47+
private int getFilenameId(Text filename) {
48+
return inputFiles.indexOf(filename.toString());
49+
}
50+
}

0 commit comments

Comments
 (0)