Merge branch 'release/2.1.5'

Andrew Clegg · Andrew Clegg · commit 2d077df5aeaa · 2013-09-11T13:26:37.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,17 @@
+Release 2.1.5
+-------------
+
+More documentation improvements, code cleanup.
+
+Workaround for a compiler issue in some versions of Java 6:
+
+https://github.com/pearson-enabling-technologies/elasticsearch-approx-plugin/issues/41
+
+Release 2.1.4
+-------------
+
+Documentation improvements.
+
 Release 2.1.3
 -------------
 
diff --git a/README.md b/README.md
@@ -17,11 +17,11 @@ Plugin < 1.3.0: ElasticSearch 0.19.X, tested on 0.19.11
 
 Plugin 1.3.X: ElasticSearch 0.20.X, tested on 0.20.6
 
-Plugin 2.1.4: ElasticSearch 0.90.2, plus significant feature and performance improvements, and breaking API changes
+Plugin 2.1.5: ElasticSearch 0.90.2, plus significant feature and performance improvements, and breaking API changes, compared to 1.3.X branch
 
 ElasticSearch 0.90.3 is not supported yet.
 
-**N.B.** If you are upgrading from a previous version to 2.1.0, please read the
+**N.B.** If you are upgrading from a previous version to 2.1.X, please read the
 following carefully, as the syntax (and semantics) have changed in several places.
 
 
diff --git a/pom.xml b/pom.xml
@@ -3,7 +3,7 @@
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>com.pearson.entech</groupId>
 	<artifactId>elasticsearch-approx-plugin</artifactId>
-	<version>2.1.4</version>
+	<version>2.1.5</version>
 	<properties>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
diff --git a/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/collectors/TimestampFirstCollector.java b/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/collectors/TimestampFirstCollector.java
@@ -17,8 +17,16 @@
 import org.elasticsearch.index.fielddata.ScriptDocValues;
 import org.elasticsearch.index.fielddata.plain.LongArrayIndexFieldData;
 
+/**
+ * A buildable collector which iterates through the values of a long datetime field, applying timezone rounding to them.
+ *  
+ * @param <V> the IndexFieldData type of the datetime field 
+ */
 public abstract class TimestampFirstCollector<V extends AtomicFieldData<? extends ScriptDocValues>> extends BuildableCollector {
 
+    /**
+     * An empty iterator over long values. 
+     */
     protected static final Iter EMPTY = new Iter.Empty();
 
     private LongValues _keyFieldValues;
@@ -36,20 +44,34 @@ public abstract class TimestampFirstCollector<V extends AtomicFieldData<? extend
     private BytesValues _valueFieldValues;
     private BytesValues.Iter _valueFieldIter;
 
+    /**
+     * Create a new collector.
+     * 
+     * @param keyFieldData key (datetime) field data
+     * @param valueFieldData value field data
+     * @param tzRounding time zone rounding
+     */
     public TimestampFirstCollector(final LongArrayIndexFieldData keyFieldData,
             final IndexFieldData<V> valueFieldData, final TimeZoneRounding tzRounding) {
         _keyFieldData = keyFieldData;
         _valueFieldData = valueFieldData;
         _tzRounding = tzRounding;
     }
 
+    /**
+     * Create a new collector.
+     * 
+     * @param keyFieldData key (datetime) field data
+     * @param tzRounding time zone rounding
+     */
     public TimestampFirstCollector(final LongArrayIndexFieldData keyFieldData,
             final TimeZoneRounding tzRounding) {
         this(keyFieldData, null, tzRounding);
     }
 
     @Override
     public void collect(final int doc) throws IOException {
+        // If the datetime field has ordinals available, we can take a bunch of shortcuts later
         if(_keyFieldValues instanceof WithOrdinals) {
             _docOrds = ((WithOrdinals) _keyFieldValues).ordinals().getOrds(doc);
             _docOrdPointer = _docOrds.offset;
@@ -66,16 +88,35 @@ public void setNextReader(final AtomicReaderContext context) throws IOException
         if(hasValueField())
             _valueFieldValues = _valueFieldData.load(context).getBytesValues();
 
+        // If we have ordinals available, we can do most of the work up front.
+        // We build a mapping from ords to rounded timestamps, so we never
+        // have to retrieve the field values for a given document. We just
+        // see which ordinals it has and then get the rounded timestamps they
+        // correspond to.
+
+        // One drawback of this approach is that if we have a very aggressively
+        // filtered query, there might be many ordinals which are never used by
+        // any of the documents we will be looking at. So we'd be wasting effort
+        // by calculating timestamps for all of the ordinals up front.
+        // TODO come up with a heuristic to avoid falling into this trap.
+
         if(_keyFieldValues instanceof WithOrdinals) {
             final int maxOrd = ((WithOrdinals) _keyFieldValues).ordinals().getMaxOrd();
             int tsPointer = 0;
+
+            // _timestamps holds the rounded timestamps
             _timestamps.resetQuick();
             _timestamps.add(0);
+
+            // _ordToTimestampPointers has one entry for every ord
             _ordToTimestampPointers.resetQuick();
             _ordToTimestampPointers.add(0);
+
+            // We cache these for some small optimizations
             long lastDateTime = 0;
             long lastTimestamp = 0;
             for(int i = 1; i < maxOrd; i++) {
+                // Get the next ordinal's value so we can calculate its timestamp
                 final long datetime = ((WithOrdinals) _keyFieldValues).getValueByOrd(i);
 
                 // If this datetime is less than a second after the previously-seen timestamp, it will have the same timestamp
@@ -95,6 +136,8 @@ public void setNextReader(final AtomicReaderContext context) throws IOException
                     }
                 }
                 lastDateTime = datetime;
+
+                // Add timestamp pointer for this ord -- could be the same as the previous ord, or a new one
                 _ordToTimestampPointers.add(tsPointer);
             }
         } else {
@@ -105,6 +148,11 @@ public void setNextReader(final AtomicReaderContext context) throws IOException
     @Override
     public void postCollection() {}
 
+    /**
+     * Are there any more timestamps available?
+     * 
+     * @return true/false
+     */
     protected boolean hasNextTimestamp() {
         if(_keyFieldValues instanceof WithOrdinals) {
             return _docOrdPointer < _docOrds.length;
@@ -113,12 +161,19 @@ protected boolean hasNextTimestamp() {
         }
     }
 
+    /**
+     * Get the next timestamp, i.e. the rounded value of the next available datetime.
+     * 
+     * @return the timestamp
+     */
     protected long nextTimestamp() {
         if(_keyFieldValues instanceof WithOrdinals) {
+            // We can bypass getting the raw datetime value, and go from ord to timestamp directly (well, directly-ish)
             final long ts = _timestamps.get(_ordToTimestampPointers.get(_docOrds.ints[_docOrdPointer]));
             _docOrdPointer++;
             return ts;
         } else {
+            // Get the next raw datetime, and if necessary, round it
             final long datetime = _docIter.next();
             // If this datetime is less than a second after the previously-seen timestamp, it will have the same timestamp
             // (true because we don't support granularity less than 1 sec)
@@ -134,14 +189,31 @@ protected long nextTimestamp() {
         }
     }
 
+    /**
+     * Returns true if this iterator is getting each timestamp once per value of a value field.
+     * Otherwise, it's getting each timestamp once per document.
+     * 
+     * @return true/false
+     */
     protected boolean hasValueField() {
         return _valueFieldData != null;
     }
 
+    /**
+     * Returns true if there is another value of a value field available, for the current doc.
+     * If there isn't, or we're not using a value field, returns false.
+     * 
+     * @return true/false
+     */
     protected boolean hasNextValue() {
         return _valueFieldIter != null && _valueFieldIter.hasNext();
     }
 
+    /**
+     * Gets the next value of the value field, or null if we're not using a value field.
+     * 
+     * @return the next value as a BytesRef, or null
+     */
     protected BytesRef nextValue() {
         return _valueFieldIter == null ? null : _valueFieldIter.next();
     }
diff --git a/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/external/XContentEnabledList.java b/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/external/XContentEnabledList.java
@@ -8,31 +8,65 @@
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentBuilderString;
 
+/**
+ * An ArrayList that implements ToXContent too. Has a name
+ * attribute which is used as its fieldname in XContent output. Then
+ * the list elements are rendered as an XContent list using their own
+ * toXContent() methods.
+ * 
+ * @param <E> list element type; must in turn implement ToXContent
+ */
 public class XContentEnabledList<E extends ToXContent>
         extends ArrayList<E> implements ToXContent {
 
+    private static final long serialVersionUID = 1L;
+
     private final String _name;
 
     private final XContentBuilderString _xName;
 
+    /**
+     * Create a list by copying in the values of the other collection.
+     * 
+     * @param data the collection to copy
+     * @param name the name of the new list
+     */
     public XContentEnabledList(final Collection<? extends E> data, final String name) {
         super(data);
         _name = name;
         _xName = null;
     }
 
+    /**
+     * Create a list with the initial capacity specified.
+     * 
+     * @param initialCapacity the starting capacity
+     * @param name the name of the new list
+     */
     public XContentEnabledList(final int initialCapacity, final String name) {
         super(initialCapacity);
         _name = name;
         _xName = null;
     }
 
+    /**
+     * Create an empty list.
+     * 
+     * @param name the name of the new list
+     */
     public XContentEnabledList(final String name) {
         _name = name;
         _xName = null;
     }
 
-    public XContentEnabledList(final int size, final XContentBuilderString name) {
+    /**
+     * Create a list with the initial capacity specified.
+     * 
+     * @param initialCapacity the starting capacity
+     * @param name the name of the new list, as XContent
+     */
+    public XContentEnabledList(final int initialCapacity, final XContentBuilderString name) {
+        super(initialCapacity);
         _name = null;
         _xName = name;
     }
diff --git a/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/internal/DateFacetExecutor.java b/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/internal/DateFacetExecutor.java
@@ -2,7 +2,6 @@
 
 import org.elasticsearch.common.joda.TimeZoneRounding;
 import org.elasticsearch.index.fielddata.IndexFieldData;
-import org.elasticsearch.index.fielddata.LongValues.Iter;
 import org.elasticsearch.index.fielddata.plain.LongArrayIndexFieldData;
 import org.elasticsearch.search.facet.FacetExecutor;
 import org.elasticsearch.search.facet.FacetPhaseExecutionException;
@@ -15,49 +14,45 @@
 import com.pearson.entech.elasticsearch.search.facet.approx.date.collectors.SlicedDistinctCollector;
 import com.pearson.entech.elasticsearch.search.facet.approx.date.collectors.TimestampFirstCollector;
 
+/**
+ * Executor for all date facets.
+ */
 public class DateFacetExecutor extends FacetExecutor {
 
-    private static final Iter __emptyIter = new Iter.Empty();
+    private final TimestampFirstCollector<?> _collector;
 
-    private final LongArrayIndexFieldData _keyFieldData;
-    private final IndexFieldData _valueFieldData;
-    private final IndexFieldData _distinctFieldData;
-    private final IndexFieldData _sliceFieldData;
+    // TODO proper use of generics
 
-    private final TimestampFirstCollector _collector;
+    /**
+     * Create a new executor.
+     * 
+     * @param keyFieldData field data for the datetime field used for timestamps
+     * @param valueFieldData field data for the optional value field, can be null
+     * @param distinctFieldData field data for the optional distinct field, can be null
+     * @param sliceFieldData field data for the optional slice field, can be null
+     * @param tzRounding a timezone rounding object
+     * @param exactThreshold exact count threshold when doing distincts
+     */
+    public DateFacetExecutor(final LongArrayIndexFieldData keyFieldData, final IndexFieldData<?> valueFieldData,
+            final IndexFieldData<?> distinctFieldData, final IndexFieldData<?> sliceFieldData,
+            final TimeZoneRounding tzRounding, final int exactThreshold) {
 
-    private final TimeZoneRounding _tzRounding;
-
-    private final int _exactThreshold;
-
-    public DateFacetExecutor(final LongArrayIndexFieldData keyFieldData, final IndexFieldData valueFieldData,
-            final IndexFieldData distinctFieldData, final IndexFieldData sliceFieldData,
-            final TimeZoneRounding tzRounding, final int exactThreshold, final boolean debug) {
-        _keyFieldData = keyFieldData;
-        _valueFieldData = valueFieldData;
-        _distinctFieldData = distinctFieldData;
-        _sliceFieldData = sliceFieldData;
-        _tzRounding = tzRounding;
-        _exactThreshold = exactThreshold;
-
-        // TODO type safety for the following constructors
-
-        if(_distinctFieldData == null && _sliceFieldData == null)
-            if(_valueFieldData == null)
+        if(distinctFieldData == null && sliceFieldData == null)
+            if(valueFieldData == null)
                 _collector = new CountingCollector<NullFieldData>(keyFieldData, tzRounding);
             else
-                _collector = new CountingCollector(keyFieldData, _valueFieldData, tzRounding);
-        else if(_distinctFieldData == null)
-            if(_valueFieldData == null)
+                _collector = new CountingCollector(keyFieldData, valueFieldData, tzRounding);
+        else if(distinctFieldData == null)
+            if(valueFieldData == null)
                 _collector = new SlicedCollector(keyFieldData, sliceFieldData, tzRounding);
             else
                 _collector = new SlicedCollector(keyFieldData, valueFieldData, sliceFieldData, tzRounding);
-        else if(_sliceFieldData == null)
-            if(_valueFieldData == null)
+        else if(sliceFieldData == null)
+            if(valueFieldData == null)
                 _collector = new DistinctCollector(keyFieldData, distinctFieldData, tzRounding, exactThreshold);
             else
                 throw new FacetPhaseExecutionException("unknown date_facet", "Can't use distinct_field and value_field together");
-        else if(_valueFieldData == null)
+        else if(valueFieldData == null)
             _collector = new SlicedDistinctCollector(keyFieldData, sliceFieldData, distinctFieldData, tzRounding, exactThreshold);
         else
             throw new FacetPhaseExecutionException("unknown date_facet", "Can't use distinct_field and value_field together");
diff --git a/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/internal/DateFacetParser.java b/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/internal/DateFacetParser.java
diff --git a/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/internal/InternalSlicedDistinctFacet.java b/src/main/java/com/pearson/entech/elasticsearch/search/facet/approx/date/internal/InternalSlicedDistinctFacet.java