Skip to content

Commit cde3eda

Browse files
committed
init commit
0 parents  commit cde3eda

File tree

6 files changed

+645
-0
lines changed

6 files changed

+645
-0
lines changed

.gitignore

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Eclipse
2+
.classpath
3+
.project
4+
.settings/
5+
6+
# Intellij
7+
.idea/
8+
*.iml
9+
*.iws
10+
11+
# Mac
12+
.DS_Store
13+
14+
# Maven
15+
log/
16+
target/

pom.xml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3+
<modelVersion>4.0.0</modelVersion>
4+
<groupId>HttpLogAnalyser</groupId>
5+
<artifactId>HttpLogAnalyser</artifactId>
6+
<version>0.0.1-SNAPSHOT</version>
7+
8+
<dependencies>
9+
<dependency>
10+
<groupId>org.apache.hadoop</groupId>
11+
<artifactId>hadoop-mapreduce-client-core</artifactId>
12+
<version>2.7.3</version>
13+
</dependency>
14+
15+
<dependency>
16+
<groupId>org.apache.hadoop</groupId>
17+
<artifactId>hadoop-common</artifactId>
18+
<version>2.7.3</version>
19+
</dependency>
20+
21+
<dependency>
22+
<groupId>org.apache.hadoop</groupId>
23+
<artifactId>hadoop-hdfs</artifactId>
24+
<version>2.7.3</version>
25+
</dependency>
26+
27+
<!-- NOTE(review): a system-scoped jdk.tools dependency with an absolute
     Windows systemPath breaks the build on any other machine. Prefer a
     ${java.home}-relative path (e.g. ${java.home}/../lib/tools.jar), or
     drop it entirely on JDK 9+ where tools.jar no longer exists. -->
<dependency>
28+
<groupId>jdk.tools</groupId>
29+
<artifactId>jdk.tools</artifactId>
30+
<version>1.8</version>
31+
<scope>system</scope>
32+
<systemPath>C:/Program Files/Java/jdk1.8.0_121/lib/tools.jar</systemPath>
33+
</dependency>
34+
</dependencies>
35+
</project>
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package csd.auth.ftw;
2+
3+
import java.io.IOException;
4+
import java.nio.charset.StandardCharsets;
5+
import java.nio.file.Files;
6+
import java.nio.file.Paths;
7+
import java.util.ArrayList;
8+
import java.util.Arrays;
9+
import java.util.HashSet;
10+
import java.util.List;
11+
12+
import org.apache.hadoop.fs.FileSystem;
13+
import org.apache.hadoop.io.Text;
14+
import org.apache.hadoop.mapreduce.Mapper;
15+
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
16+
17+
public class MapperClass extends Mapper<Text, Text, Text, Text> {
18+
public static final String PUNCTUATION = "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘…„—´";
19+
20+
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
21+
String line = value.toString().toLowerCase();
22+
23+
// remove whitespace
24+
line = removeWhiteSpace(line);
25+
26+
// remove punctuation
27+
line = removePunctuation(line);
28+
29+
// keep unique words
30+
HashSet<String> words = new HashSet<String>(Arrays.asList(line.split(" ")));
31+
32+
// remove stopwords
33+
words = removeStopwords(words);
34+
35+
// stem each word
36+
ArrayList<String> stemmedWords = new ArrayList<String>();
37+
for (String word: words)
38+
stemmedWords.add(applyStemming(word));
39+
40+
for (String word: stemmedWords) {
41+
Text keyWord = new Text(word);
42+
Text valueFilename = new Text(getCurrentFilename(context));
43+
44+
context.write(keyWord, valueFilename);
45+
}
46+
}
47+
48+
private String removeWhiteSpace(String line) {
49+
String pattern = "\\s+";
50+
return line.replaceAll(pattern, " ").trim();
51+
}
52+
53+
private String removePunctuation(String str) {
54+
String pattern = String.join("|\\", PUNCTUATION.split(""));
55+
return str.replaceAll(pattern, "");
56+
}
57+
58+
private HashSet<String> removeStopwords(HashSet<String> words) throws IOException {
59+
List<String> stopwords = Files.readAllLines(Paths.get("C:\\users\\nikos\\stopwords.txt"), StandardCharsets.UTF_8);
60+
HashSet<String> filteredWords = new HashSet<>();
61+
62+
for (String word: words) {
63+
if (!stopwords.contains(word))
64+
filteredWords.add(word);
65+
}
66+
67+
return filteredWords;
68+
}
69+
70+
private String applyStemming(String word) {
71+
Stemmer stemmer = new Stemmer();
72+
stemmer.add(word.toCharArray(), word.length());
73+
stemmer.stem();
74+
75+
return stemmer.toString();
76+
}
77+
78+
private String getCurrentFilename(Context context) {
79+
return ((FileSplit) context.getInputSplit()).getPath().getName();
80+
}
81+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package csd.auth.ftw;
2+
3+
import java.io.IOException;
4+
import java.util.ArrayList;
5+
import java.util.Arrays;
6+
7+
import org.apache.hadoop.fs.FileStatus;
8+
import org.apache.hadoop.fs.FileSystem;
9+
import org.apache.hadoop.fs.Path;
10+
import org.apache.hadoop.io.Text;
11+
import org.apache.hadoop.mapreduce.Reducer;
12+
13+
public class ReducerClass extends Reducer<Text, Text, Text, Text> {
14+
private ArrayList<String> inputFiles = null;
15+
16+
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException {
17+
resolveInputFiles(context);
18+
19+
int[] disList = new int[inputFiles.size()];
20+
Arrays.fill(disList, 0);
21+
22+
for (Text filename: values) {
23+
int id = getFilenameId(filename);
24+
disList[id] = 1;
25+
}
26+
27+
// TODO write distlist
28+
// context.write(key, arg1);
29+
}
30+
31+
private void resolveInputFiles(Context context) throws IOException {
32+
if (inputFiles != null)
33+
return;
34+
35+
inputFiles = new ArrayList<>();
36+
37+
FileSystem fs = FileSystem.get(context.getConfiguration());
38+
String filename = context.getConfiguration().get("map.input.dir");
39+
FileStatus[] filesStatuses = fs.listStatus(new Path(filename));
40+
41+
for (int i=0; i<filesStatuses.length; i++) {
42+
FileStatus status = filesStatuses[i];
43+
inputFiles.add(status.getPath().toString());
44+
}
45+
}
46+
47+
private int getFilenameId(Text filename) {
48+
return inputFiles.indexOf(filename.toString());
49+
}
50+
}

0 commit comments

Comments
 (0)