FAQ
Does SimpleFacetsTest fail for you?

I svn up'ed and i see a few failures
On Fri, Sep 3, 2010 at 1:12 PM, wrote:

Author: yonik
Date: Fri Sep 3 17:12:26 2010
New Revision: 992382

URL: http://svn.apache.org/viewvc?rev=992382&view=rev
Log:
SOLR-2092: use native long PQ to order facet results

Added:

lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
(with props)
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java

lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=992382&r1=992381&r2=992382&view=diff

==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Fri Sep 3 17:12:26 2010
@@ -288,6 +288,12 @@ Optimizations
* SOLR-2046: Simplify legacy replication scripts by adding common
functions
to scripts-util. (koji)

+* SOLR-2092: Speed up single-valued and multi-valued "fc" faceting.
Typical
+ improvement is 5%, but can be much greater (up to 10x faster) when
facet.offset
+ is very large (deep paging). (yonik)
+
+
+
Bug Fixes
----------------------


Modified:
lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java
URL:
http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java?rev=992382&r1=992381&r2=992382&view=diff

==============================================================================
---
lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java
(original)
+++
lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java Fri
Sep 3 17:12:26 2010
@@ -44,6 +44,7 @@ import org.apache.solr.util.BoundedTreeS
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.DateMathParser;
import org.apache.solr.handler.component.ResponseBuilder;
+import org.apache.solr.util.LongPriorityQueue;

import java.io.IOException;
import java.util.*;
@@ -416,9 +417,9 @@ public class SimpleFacets {
}

final int nTerms=endTermIndex-startTermIndex;
+ int missingCount = -1;

CharArr spare = new CharArr();
-
if (nTerms>0 && docs.size() >= mincount) {

// count collection array only needs to be as big as the number of
terms we are
@@ -475,6 +476,10 @@ public class SimpleFacets {
}
}

+ if (startTermIndex == 0) {
+ missingCount = counts[0];
+ }
+
// IDEA: we could also maintain a count of "other"... everything that
fell outside
// of the top 'N'

@@ -484,7 +489,8 @@ public class SimpleFacets {
if (sort.equals(FacetParams.FACET_SORT_COUNT) ||
sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
maxsize = Math.min(maxsize, nTerms);
- final BoundedTreeSet<CountPair<BytesRef,Integer>> queue = new
BoundedTreeSet<CountPair<BytesRef,Integer>>(maxsize);
+ LongPriorityQueue queue = new
LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);
+
int min=mincount-1; // the smallest value in the top 'N' values
for (int i=(startTermIndex==0)?1:0; i<nTerms; i++) {
int c = counts[i];
@@ -492,18 +498,33 @@ public class SimpleFacets {
// NOTE: we use c>min rather than c>=min as an optimization
because we are going in
// index order, so we already know that the keys are ordered.
This can be very
// important if a lot of the counts are repeated (like zero
counts would be).
- queue.add(new
CountPair<BytesRef,Integer>(si.lookup(startTermIndex+i, new BytesRef()),
c));
- if (queue.size()>=maxsize) min=queue.last().val;
+
+ // smaller term numbers sort higher, so subtract the term
number instead
+ long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i);
+ boolean displaced = queue.insert(pair);
+ if (displaced) min=(int)(queue.top() >>> 32);
}
}
- // now select the right page from the results
- for (CountPair<BytesRef,Integer> p : queue) {
- if (--off>=0) continue;
- if (--lim<0) break;
+
+ // if we are deep paging, we don't have to order the highest
"offset" counts.
+ int collectCount = Math.max(0, queue.size() - off);
+ assert collectCount < lim;
+
+ // the start and end indexes of our list "sorted" (starting with
the highest value)
+ int sortedIdxStart = queue.size() - (collectCount - 1);
+ int sortedIdxEnd = queue.size() + 1;
+ final long[] sorted = queue.sort(collectCount);
+
+ for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
+ long pair = sorted[i];
+ int c = (int)(pair >>> 32);
+ int tnum = Integer.MAX_VALUE - (int)pair;
+
spare.reset();
- ft.indexedToReadable(p.key, spare);
- res.add(spare.toString(), p.val);
+ ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), spare);
+ res.add(spare.toString(), c);
}
+
} else {
// add results in index order
int i=(startTermIndex==0)?1:0;
@@ -526,7 +547,10 @@ public class SimpleFacets {
}

if (missing) {
- res.add(null, getFieldMissingCount(searcher,docs,fieldName));
+ if (missingCount < 0) {
+ missingCount = getFieldMissingCount(searcher,docs,fieldName);
+ }
+ res.add(null, missingCount);
}

return res;

Modified:
lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
URL:
http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java?rev=992382&r1=992381&r2=992382&view=diff

==============================================================================
---
lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
(original)
+++
lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
Fri Sep 3 17:12:26 2010
@@ -37,6 +37,7 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.*;
+import org.apache.solr.util.LongPriorityQueue;
import org.apache.solr.util.PrimUtils;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.handler.component.StatsValues;
@@ -470,7 +471,9 @@ public class UnInvertedField {
if (baseSize >= mincount) {

final int[] index = this.index;
- final int[] counts = new int[numTermsInField];
+ // tricky: we add more more element than we need because we will
reuse this array later
+ // for ordering term ords before converting to term labels.
+ final int[] counts = new int[numTermsInField + 1];

//
// If there is prefix, find it's start and end term numbers
@@ -575,7 +578,8 @@ public class UnInvertedField {
if (sort.equals(FacetParams.FACET_SORT_COUNT) ||
sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
maxsize = Math.min(maxsize, numTermsInField);
- final BoundedTreeSet<Long> queue = new
BoundedTreeSet<Long>(maxsize);
+ LongPriorityQueue queue = new
LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);
+
int min=mincount-1; // the smallest value in the top 'N' values
for (int i=startTerm; i<endTerm; i++) {
int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
@@ -584,55 +588,63 @@ public class UnInvertedField {
// index order, so we already know that the keys are ordered.
This can be very
// important if a lot of the counts are repeated (like zero
counts would be).

- // minimize object creation and speed comparison by creating a
long that
- // encompasses both count and term number.
- // Since smaller values are kept in the TreeSet, make higher
counts smaller.
- //
- // for equal counts, lower term numbers
- // should come first and hence be "greater"
-
- //long pair = (((long)c)<<32) | (0x7fffffff-i) ; // use if
priority queue
- long pair = (((long)-c)<<32) | i;
- queue.add(new Long(pair));
- if (queue.size()>=maxsize) min=-(int)(queue.last().longValue()
32);
+ // smaller term numbers sort higher, so subtract the term
number instead
+ long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i);
+ boolean displaced = queue.insert(pair);
+ if (displaced) min=(int)(queue.top() >>> 32);
}
}
+
// now select the right page from the results

+ // if we are deep paging, we don't have to order the highest
"offset" counts.
+ int collectCount = Math.max(0, queue.size() - off);
+ assert collectCount < lim;
+
+ // the start and end indexes of our list "sorted" (starting with
the highest value)
+ int sortedIdxStart = queue.size() - (collectCount - 1);
+ int sortedIdxEnd = queue.size() + 1;
+ final long[] sorted = queue.sort(collectCount);

- final int[] tnums = new int[Math.min(Math.max(0,
queue.size()-off), lim)];
final int[] indirect = counts; // reuse the counts array for the
index into the tnums array
- assert indirect.length >= tnums.length;
-
- int tnumCount = 0;
+ assert indirect.length >= sortedIdxEnd;
+
+ for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
+ long pair = sorted[i];
+ int c = (int)(pair >>> 32);
+ int tnum = Integer.MAX_VALUE - (int)pair;
+
+ indirect[i] = i; // store the index for indirect sorting
+ sorted[i] = tnum; // reuse the "sorted" array to store the term
numbers for indirect sorting

- for (Long p : queue) {
- if (--off>=0) continue;
- if (--lim<0) break;
- int c = -(int)(p.longValue() >>> 32);
- //int tnum = 0x7fffffff - (int)p.longValue(); // use if
priority queue
- int tnum = (int)p.longValue();
- indirect[tnumCount] = tnumCount;
- tnums[tnumCount++] = tnum;
- // String label = ft.indexedToReadable(getTermText(te, tnum));
// add a null label for now... we'll fill it in later.
res.add(null, c);
}

// now sort the indexes by the term numbers
- PrimUtils.sort(0, tnumCount, indirect, new
PrimUtils.IntComparator() {
+ PrimUtils.sort(sortedIdxStart, sortedIdxEnd, indirect, new
PrimUtils.IntComparator() {
@Override
public int compare(int a, int b) {
- return tnums[a] - tnums[b];
+ return (int)sorted[a] - (int)sorted[b];
+ }
+
+ @Override
+ public boolean lessThan(int a, int b) {
+ return sorted[a] < sorted[b];
+ }
+
+ @Override
+ public boolean equals(int a, int b) {
+ return sorted[a] == sorted[b];
}
});

// convert the term numbers to term values and set as the label
- for (int i=0; i<tnumCount; i++) {
+ for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
int idx = indirect[i];
- int tnum = tnums[idx];
- String label = getReadableValue(getTermValue(te, tnum), ft,
spare);
- res.setName(idx, label);
+ int tnum = (int)sorted[idx];
+ String label = getReadableValue(getTermValue(te, tnum), ft,
spare);
+ res.setName(idx - sortedIdxStart, label);
}

} else {

Added:
lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
URL:
http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java?rev=992382&view=auto

==============================================================================
---
lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
(added)
+++
lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
Fri Sep 3 17:12:26 2010
@@ -0,0 +1,235 @@
+package org.apache.solr.util;
+
+import java.util.Arrays;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** A native long priority queue.
+ *
+ * @lucene.internal
+*/
+public class LongPriorityQueue {
+ protected int size; // number of elements currently in the
queue
+ protected int currentCapacity; // number of elements the queue can hold
w/o expanding
+ protected int maxSize; // max number of elements allowed in the
queue
+ protected long[] heap;
+ protected final long sentinel; // represents a null return value
+
+ public LongPriorityQueue(int initialSize, int maxSize, long sentinel) {
+ this.maxSize = maxSize;
+ this.sentinel = sentinel;
+ initialize(initialSize);
+ }
+
+
+ protected void initialize(int sz) {
+ int heapSize;
+ if (0 == sz)
+ // We allocate 1 extra to avoid if statement in top()
+ heapSize = 2;
+ else {
+ // NOTE: we add +1 because all access to heap is
+ // 1-based not 0-based. heap[0] is unused.
+ heapSize = Math.max(sz, sz + 1); // handle overflow
+ }
+ heap = new long[heapSize];
+ currentCapacity = sz;
+ }
+
+ public int getCurrentCapacity() {
+ return currentCapacity;
+ }
+
+ public void resize(int sz) {
+ int heapSize;
+ if (sz > maxSize) {
+ maxSize = sz;
+ }
+ if (0 == sz)
+ // We allocate 1 extra to avoid if statement in top()
+ heapSize = 2;
+ else {
+ heapSize = Math.max(sz, sz + 1); // handle overflow
+ }
+ heap = Arrays.copyOf(heap, heapSize);
+ currentCapacity = sz;
+ }
+
+ /**
+ * Adds an object to a PriorityQueue in log(size) time. If one tries to
add
+ * more objects than maxSize from initialize an
+ * {@link ArrayIndexOutOfBoundsException} is thrown.
+ *
+ * @return the new 'top' element in the queue.
+ */
+ public long add(long element) {
+ if (size >= currentCapacity) {
+ int newSize = Math.min(currentCapacity <<1, maxSize);
+ if (newSize < currentCapacity) newSize = Integer.MAX_VALUE; //
handle overflow
+ resize(newSize);
+ }
+ size++;
+ heap[size] = element;
+ upHeap();
+ return heap[1];
+ }
+
+ /**
+ * Adds an object to a PriorityQueue in log(size) time. If one tries to
add
+ * more objects than the current capacity, an
+ * {@link ArrayIndexOutOfBoundsException} is thrown.
+ */
+ public void addNoCheck(long element) {
+ ++size;
+ heap[size] = element;
+ upHeap();
+ }
+
+ /**
+ * Adds an object to a PriorityQueue in log(size) time.
+ * It returns the smallest object (if any) that was
+ * dropped off the heap because it was full, or
+ * the sentinel value.
+ *
+ * This can be
+ * the given parameter (in case it is smaller than the
+ * full heap's minimum, and couldn't be added), or another
+ * object that was previously the smallest value in the
+ * heap and now has been replaced by a larger one, or null
+ * if the queue wasn't yet full with maxSize elements.
+ */
+ public long insertWithOverflow(long element) {
+ if (size < maxSize) {
+ add(element);
+ return sentinel;
+ } else if (element > heap[1]) {
+ long ret = heap[1];
+ heap[1] = element;
+ updateTop();
+ return ret;
+ } else {
+ return element;
+ }
+ }
+
+ /** inserts the element and returns true if this element caused another
element
+ * to be dropped from the queue. */
+ public boolean insert(long element) {
+ if (size < maxSize) {
+ add(element);
+ return false;
+ } else if (element > heap[1]) {
+ // long ret = heap[1];
+ heap[1] = element;
+ updateTop();
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /** Returns the least element of the PriorityQueue in constant time. */
+ public long top() {
+ return heap[1];
+ }
+
+ /** Removes and returns the least element of the PriorityQueue in
log(size)
+ time. Only valid if size() > 0.
+ */
+ public long pop() {
+ long result = heap[1]; // save first value
+ heap[1] = heap[size]; // move last to first
+ size--;
+ downHeap(); // adjust heap
+ return result;
+ }
+
+ /**
+ * Should be called when the Object at top changes values.
+ * @return the new 'top' element.
+ */
+ public long updateTop() {
+ downHeap();
+ return heap[1];
+ }
+
+ /** Returns the number of elements currently stored in the
PriorityQueue. */
+ public int size() {
+ return size;
+ }
+
+ /** Returns the array used to hold the heap, with the smallest item at
array[1]
+ * and the last (but not necessarily largest) at array[size()]. This
is *not*
+ * fully sorted.
+ */
+ public long[] getInternalArray() {
+ return heap;
+ }
+
+ /** Pops the smallest n items from the heap, placing them in the
internal array at
+ * arr[size] through arr[size-(n-1)] with the smallest (first element
popped)
+ * being at arr[size]. The internal array is returned.
+ */
+ public long[] sort(int n) {
+ while (--n >= 0) {
+ long result = heap[1]; // save first value
+ heap[1] = heap[size]; // move last to first
+ heap[size] = result; // place it last
+ size--;
+ downHeap(); // adjust heap
+ }
+ return heap;
+ }
+
+ /** Removes all entries from the PriorityQueue. */
+ public void clear() {
+ size = 0;
+ }
+
+ private void upHeap() {
+ int i = size;
+ long node = heap[i]; // save bottom node
+ int j = i >>> 1;
+ while (j > 0 && node < heap[j]) {
+ heap[i] = heap[j]; // shift parents down
+ i = j;
+ j = j >>> 1;
+ }
+ heap[i] = node; // install saved node
+ }
+
+ private void downHeap() {
+ int i = 1;
+ long node = heap[i]; // save top node
+ int j = i << 1; // find smaller child
+ int k = j + 1;
+ if (k <= size && heap[k] < heap[j]) {
+ j = k;
+ }
+ while (j <= size && heap[j] < node) {
+ heap[i] = heap[j]; // shift up child
+ i = j;
+ j = i << 1;
+ k = j + 1;
+ if (k <= size && heap[k] < heap[j]) {
+ j = k;
+ }
+ }
+ heap[i] = node; // install saved node
+ }
+}

Propchange:
lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java

------------------------------------------------------------------------------
svn:eol-style = native

Propchange:
lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java

------------------------------------------------------------------------------
svn:executable = *

Propchange:
lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java

------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL

Modified:
lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java
URL:
http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java?rev=992382&r1=992381&r2=992382&view=diff

==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java
(original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java
Fri Sep 3 17:12:26 2010
@@ -51,4 +51,44 @@ public class PrimUtilsTest extends Lucen
}
}

+ public void testLongPriorityQueue() {
+ int maxSize = 100;
+ long[] a = new long[maxSize];
+ long[] discards = new long[maxSize];
+
+ for (int iter=0; iter<100; iter++) {
+ int discardCount = 0;
+ int startSize = r.nextInt(maxSize) + 1;
+ int endSize = startSize==maxSize ? maxSize : startSize +
r.nextInt(maxSize-startSize);
+ int adds = r.nextInt(maxSize+1);
+ // System.out.println("startSize=" + startSize + " endSize=" +
endSize + " adds="+adds);
+ LongPriorityQueue pq = new LongPriorityQueue(startSize, endSize,
Long.MIN_VALUE);
+
+ for (int i=0; i<adds; i++) {
+ long v = r.nextLong();
+ a[i] = v;
+ long out = pq.insertWithOverflow(v);
+ if (i < endSize) {
+ assertEquals(out, Long.MIN_VALUE);
+ } else {
+ discards[discardCount++] = out;
+ }
+ }
+ assertEquals(Math.min(adds,endSize), pq.size());
+ assertEquals(adds, pq.size() + discardCount);
+
+ Arrays.sort(a, 0, adds);
+ Arrays.sort(discards, 0, discardCount);
+ for (int i=0; i<discardCount; i++) {
+ assertEquals(a[i], discards[i]);
+ }
+
+ for (int i=discardCount; i<adds; i++) {
+ assertEquals(a[i], pq.pop());
+ }
+
+ assertEquals(0, pq.size());
+ }
+ }
+
}
\ No newline at end of file


--
Robert Muir
rcmuir@gmail.com

Search Discussions

  • Yonik Seeley at Sep 3, 2010 at 6:34 pm

    On Fri, Sep 3, 2010 at 2:24 PM, Robert Muir wrote:
    Does SimpleFacetsTest fail for you?
    I svn up'ed and i see a few failures
    Ack - it does! I'll fix.
    I'm wondering how that happened... I think I may have previously done a
    ant test -Dtestcase=TestSimpleFacets (notice the misspelling)

    -Yonik
    http://lucenerevolution.org Lucene/Solr Conference, Boston Oct 7-8

    On Fri, Sep 3, 2010 at 1:12 PM, wrote:

    Author: yonik
    Date: Fri Sep  3 17:12:26 2010
    New Revision: 992382

    URL: http://svn.apache.org/viewvc?rev=992382&view=rev
    Log:
    SOLR-2092: use native long PQ to order facet results

    Added:

    lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
    (with props)
    Modified:
    lucene/dev/trunk/solr/CHANGES.txt

    lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java

    lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java

    Modified: lucene/dev/trunk/solr/CHANGES.txt
    URL:
    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=992382&r1=992381&r2=992382&view=diff

    ==============================================================================
    --- lucene/dev/trunk/solr/CHANGES.txt (original)
    +++ lucene/dev/trunk/solr/CHANGES.txt Fri Sep  3 17:12:26 2010
    @@ -288,6 +288,12 @@ Optimizations
    * SOLR-2046: Simplify legacy replication scripts by adding common
    functions
    to scripts-util. (koji)

    +* SOLR-2092: Speed up single-valued and multi-valued "fc" faceting.
    Typical
    +  improvement is 5%, but can be much greater (up to 10x faster) when
    facet.offset
    +  is very large (deep paging). (yonik)
    +
    +
    +
    Bug Fixes
    ----------------------


    Modified:
    lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java
    URL:
    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java?rev=992382&r1=992381&r2=992382&view=diff

    ==============================================================================
    ---
    lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java
    (original)
    +++
    lucene/dev/trunk/solr/src/java/org/apache/solr/request/SimpleFacets.java Fri
    Sep  3 17:12:26 2010
    @@ -44,6 +44,7 @@ import org.apache.solr.util.BoundedTreeS
    import org.apache.solr.util.ByteUtils;
    import org.apache.solr.util.DateMathParser;
    import org.apache.solr.handler.component.ResponseBuilder;
    +import org.apache.solr.util.LongPriorityQueue;

    import java.io.IOException;
    import java.util.*;
    @@ -416,9 +417,9 @@ public class SimpleFacets {
    }

    final int nTerms=endTermIndex-startTermIndex;
    +    int missingCount = -1;

    CharArr spare = new CharArr();
    -
    if (nTerms>0 && docs.size() >= mincount) {

    // count collection array only needs to be as big as the number of
    terms we are
    @@ -475,6 +476,10 @@ public class SimpleFacets {
    }
    }

    +      if (startTermIndex == 0) {
    +        missingCount = counts[0];
    +      }
    +
    // IDEA: we could also maintain a count of "other"... everything
    that fell outside
    // of the top 'N'

    @@ -484,7 +489,8 @@ public class SimpleFacets {
    if (sort.equals(FacetParams.FACET_SORT_COUNT) ||
    sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
    int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
    maxsize = Math.min(maxsize, nTerms);
    -        final BoundedTreeSet<CountPair<BytesRef,Integer>> queue = new
    BoundedTreeSet<CountPair<BytesRef,Integer>>(maxsize);
    +        LongPriorityQueue queue = new
    LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);
    +
    int min=mincount-1;  // the smallest value in the top 'N' values
    for (int i=(startTermIndex==0)?1:0; i<nTerms; i++) {
    int c = counts[i];
    @@ -492,18 +498,33 @@ public class SimpleFacets {
    // NOTE: we use c>min rather than c>=min as an optimization
    because we are going in
    // index order, so we already know that the keys are ordered.
    This can be very
    // important if a lot of the counts are repeated (like zero
    counts would be).
    -            queue.add(new
    CountPair<BytesRef,Integer>(si.lookup(startTermIndex+i, new BytesRef()),
    c));
    -            if (queue.size()>=maxsize) min=queue.last().val;
    +
    +            // smaller term numbers sort higher, so subtract the term
    number instead
    +            long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i);
    +            boolean displaced = queue.insert(pair);
    +            if (displaced) min=(int)(queue.top() >>> 32);
    }
    }
    -        // now select the right page from the results
    -        for (CountPair<BytesRef,Integer> p : queue) {
    -          if (--off>=0) continue;
    -          if (--lim<0) break;
    +
    +        // if we are deep paging, we don't have to order the highest
    "offset" counts.
    +        int collectCount = Math.max(0, queue.size() - off);
    +        assert collectCount < lim;
    +
    +        // the start and end indexes of our list "sorted" (starting with
    the highest value)
    +        int sortedIdxStart = queue.size() - (collectCount - 1);
    +        int sortedIdxEnd = queue.size() + 1;
    +        final long[] sorted = queue.sort(collectCount);
    +
    +        for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
    +          long pair = sorted[i];
    +          int c = (int)(pair >>> 32);
    +          int tnum = Integer.MAX_VALUE - (int)pair;
    +
    spare.reset();
    -          ft.indexedToReadable(p.key, spare);
    -          res.add(spare.toString(), p.val);
    +          ft.indexedToReadable(si.lookup(startTermIndex+tnum, br),
    spare);
    +          res.add(spare.toString(), c);
    }
    +
    } else {
    // add results in index order
    int i=(startTermIndex==0)?1:0;
    @@ -526,7 +547,10 @@ public class SimpleFacets {
    }

    if (missing) {
    -      res.add(null, getFieldMissingCount(searcher,docs,fieldName));
    +      if (missingCount < 0) {
    +        missingCount = getFieldMissingCount(searcher,docs,fieldName);
    +      }
    +      res.add(null, missingCount);
    }

    return res;

    Modified:
    lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
    URL:
    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java?rev=992382&r1=992381&r2=992382&view=diff

    ==============================================================================
    ---
    lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
    (original)
    +++
    lucene/dev/trunk/solr/src/java/org/apache/solr/request/UnInvertedField.java
    Fri Sep  3 17:12:26 2010
    @@ -37,6 +37,7 @@ import org.apache.solr.core.SolrCore;
    import org.apache.solr.schema.FieldType;
    import org.apache.solr.schema.TrieField;
    import org.apache.solr.search.*;
    +import org.apache.solr.util.LongPriorityQueue;
    import org.apache.solr.util.PrimUtils;
    import org.apache.solr.util.BoundedTreeSet;
    import org.apache.solr.handler.component.StatsValues;
    @@ -470,7 +471,9 @@ public class UnInvertedField {
    if (baseSize >= mincount) {

    final int[] index = this.index;
    -      final int[] counts = new int[numTermsInField];
    +      // tricky: we add more more element than we need because we will
    reuse this array later
    +      // for ordering term ords before converting to term labels.
    +      final int[] counts = new int[numTermsInField + 1];

    //
    // If there is prefix, find it's start and end term numbers
    @@ -575,7 +578,8 @@ public class UnInvertedField {
    if (sort.equals(FacetParams.FACET_SORT_COUNT) ||
    sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
    int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
    maxsize = Math.min(maxsize, numTermsInField);
    -        final BoundedTreeSet<Long> queue = new
    BoundedTreeSet<Long>(maxsize);
    +        LongPriorityQueue queue = new
    LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);
    +
    int min=mincount-1;  // the smallest value in the top 'N' values
    for (int i=startTerm; i<endTerm; i++) {
    int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
    @@ -584,55 +588,63 @@ public class UnInvertedField {
    // index order, so we already know that the keys are ordered.
    This can be very
    // important if a lot of the counts are repeated (like zero
    counts would be).

    -            // minimize object creation and speed comparison by creating
    a long that
    -            // encompasses both count and term number.
    -            // Since smaller values are kept in the TreeSet, make higher
    counts smaller.
    -            //
    -            //   for equal counts, lower term numbers
    -            // should come first and hence be "greater"
    -
    -            //long pair = (((long)c)<<32) | (0x7fffffff-i) ;   // use if
    priority queue
    -            long pair = (((long)-c)<<32) | i;
    -            queue.add(new Long(pair));
    -            if (queue.size()>=maxsize)
    min=-(int)(queue.last().longValue() >>> 32);
    +            // smaller term numbers sort higher, so subtract the term
    number instead
    +            long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i);
    +            boolean displaced = queue.insert(pair);
    +            if (displaced) min=(int)(queue.top() >>> 32);
    }
    }
    +
    // now select the right page from the results

    +        // if we are deep paging, we don't have to order the highest
    "offset" counts.
    +        int collectCount = Math.max(0, queue.size() - off);
    +        assert collectCount < lim;
    +
    +        // the start and end indexes of our list "sorted" (starting with
    the highest value)
    +        int sortedIdxStart = queue.size() - (collectCount - 1);
    +        int sortedIdxEnd = queue.size() + 1;
    +        final long[] sorted = queue.sort(collectCount);

    -        final int[] tnums = new int[Math.min(Math.max(0,
    queue.size()-off), lim)];
    final int[] indirect = counts;  // reuse the counts array for the
    index into the tnums array
    -        assert indirect.length >= tnums.length;
    -
    -        int tnumCount = 0;
    +        assert indirect.length >= sortedIdxEnd;
    +
    +        for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
    +          long pair = sorted[i];
    +          int c = (int)(pair >>> 32);
    +          int tnum = Integer.MAX_VALUE - (int)pair;
    +
    +          indirect[i] = i;   // store the index for indirect sorting
    +          sorted[i] = tnum;  // reuse the "sorted" array to store the
    term numbers for indirect sorting

    -        for (Long p : queue) {
    -          if (--off>=0) continue;
    -          if (--lim<0) break;
    -          int c = -(int)(p.longValue() >>> 32);
    -          //int tnum = 0x7fffffff - (int)p.longValue();  // use if
    priority queue
    -          int tnum = (int)p.longValue();
    -          indirect[tnumCount] = tnumCount;
    -          tnums[tnumCount++] = tnum;
    -          // String label = ft.indexedToReadable(getTermText(te, tnum));
    // add a null label for now... we'll fill it in later.
    res.add(null, c);
    }

    // now sort the indexes by the term numbers
    -        PrimUtils.sort(0, tnumCount, indirect, new
    PrimUtils.IntComparator() {
    +        PrimUtils.sort(sortedIdxStart, sortedIdxEnd, indirect, new
    PrimUtils.IntComparator() {
    @Override
    public int compare(int a, int b) {
    -            return tnums[a] - tnums[b];
    +            return (int)sorted[a] - (int)sorted[b];
    +          }
    +
    +          @Override
    +          public boolean lessThan(int a, int b) {
    +            return sorted[a] < sorted[b];
    +          }
    +
    +          @Override
    +          public boolean equals(int a, int b) {
    +            return sorted[a] == sorted[b];
    }
    });

    // convert the term numbers to term values and set as the label
    -        for (int i=0; i<tnumCount; i++) {
    +        for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
    int idx = indirect[i];
    -          int tnum = tnums[idx];
    -          String label = getReadableValue(getTermValue(te, tnum), ft,
    spare);
    -          res.setName(idx, label);
    +          int tnum = (int)sorted[idx];
    +          String label = getReadableValue(getTermValue(te, tnum), ft,
    spare);
    +          res.setName(idx - sortedIdxStart, label);
    }

    } else {

    Added:
    lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
    URL:
    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java?rev=992382&view=auto

    ==============================================================================
    ---
    lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
    (added)
    +++
    lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java
    Fri Sep  3 17:12:26 2010
    @@ -0,0 +1,235 @@
    +package org.apache.solr.util;
    +
    +import java.util.Arrays;
    +
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version
    2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
    implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +/** A native long priority queue.
    + *
    + * @lucene.internal
    +*/
    +public class LongPriorityQueue {
    +  protected int size;             // number of elements currently in the
    queue
    +  protected int currentCapacity;  // number of elements the queue can
    hold w/o expanding
    +  protected int maxSize;          // max number of elements allowed in
    the queue
    +  protected long[] heap;
    +  protected final long sentinel;   // represents a null return value
    +
    +  public LongPriorityQueue(int initialSize, int maxSize, long sentinel) {
    +    this.maxSize = maxSize;
    +    this.sentinel = sentinel;
    +    initialize(initialSize);
    +  }
    +
    +
    +  protected void initialize(int sz) {
    +    int heapSize;
    +    if (0 == sz)
    +      // We allocate 1 extra to avoid if statement in top()
    +      heapSize = 2;
    +    else {
    +      // NOTE: we add +1 because all access to heap is
    +      // 1-based not 0-based.  heap[0] is unused.
    +      heapSize = Math.max(sz, sz + 1); // handle overflow
    +    }
    +    heap = new long[heapSize];
    +    currentCapacity = sz;
    +  }
    +
    +  public int getCurrentCapacity() {
    +    return currentCapacity;
    +  }
    +
    +  public void resize(int sz) {
    +    int heapSize;
    +    if (sz > maxSize) {
    +      maxSize = sz;
    +    }
    +    if (0 == sz)
    +      // We allocate 1 extra to avoid if statement in top()
    +      heapSize = 2;
    +    else {
    +      heapSize = Math.max(sz, sz + 1); // handle overflow
    +    }
    +    heap = Arrays.copyOf(heap, heapSize);
    +    currentCapacity = sz;
    +  }
    +
    +  /**
    +   * Adds an object to a PriorityQueue in log(size) time. If one tries to
    add
    +   * more objects than maxSize from initialize an
    +   * {@link ArrayIndexOutOfBoundsException} is thrown.
    +   *
    +   * @return the new 'top' element in the queue.
    +   */
    +  public long add(long element) {
    +    if (size >= currentCapacity) {
    +      int newSize = Math.min(currentCapacity <<1, maxSize);
    +      if (newSize < currentCapacity) newSize = Integer.MAX_VALUE;  //
    handle overflow
    +      resize(newSize);
    +    }
    +    size++;
    +    heap[size] = element;
    +    upHeap();
    +    return heap[1];
    +  }
    +
    + /**
    +   * Adds an object to a PriorityQueue in log(size) time. If one tries to
    add
    +   * more objects than the current capacity, an
    +   * {@link ArrayIndexOutOfBoundsException} is thrown.
    +   */
    +  public void addNoCheck(long element) {
    +    ++size;
    +    heap[size] = element;
    +    upHeap();
    +  }
    +
    +  /**
    +   * Adds an object to a PriorityQueue in log(size) time.
    +   * It returns the smallest object (if any) that was
    +   * dropped off the heap because it was full, or
    +   * the sentinel value.
    +   *
    +   *  This can be
    +   * the given parameter (in case it is smaller than the
    +   * full heap's minimum, and couldn't be added), or another
    +   * object that was previously the smallest value in the
    +   * heap and now has been replaced by a larger one, or null
    +   * if the queue wasn't yet full with maxSize elements.
    +   */
    +  public long insertWithOverflow(long element) {
    +    if (size < maxSize) {
    +      add(element);
    +      return sentinel;
    +    } else if (element > heap[1]) {
    +      long ret = heap[1];
    +      heap[1] = element;
    +      updateTop();
    +      return ret;
    +    } else {
    +      return element;
    +    }
    +  }
    +
    +  /** inserts the element and returns true if this element caused another
    element
    +   * to be dropped from the queue. */
    +  public boolean insert(long element) {
    +    if (size < maxSize) {
    +      add(element);
    +      return false;
    +    } else if (element > heap[1]) {
    +      // long ret = heap[1];
    +      heap[1] = element;
    +      updateTop();
    +      return true;
    +    } else {
    +      return false;
    +    }
    +  }
    +
    +  /** Returns the least element of the PriorityQueue in constant time. */
    +  public long top() {
    +    return heap[1];
    +  }
    +
    +  /** Removes and returns the least element of the PriorityQueue in
    log(size)
    +    time.  Only valid if size() > 0.
    +   */
    +  public long pop() {
    +    long result = heap[1];               // save first value
    +    heap[1] = heap[size];                // move last to first
    +    size--;
    +    downHeap();                                  // adjust heap
    +    return result;
    +  }
    +
    +  /**
    +   * Should be called when the Object at top changes values.
    +   * @return the new 'top' element.
    +   */
    +  public long updateTop() {
    +    downHeap();
    +    return heap[1];
    +  }
    +
    +  /** Returns the number of elements currently stored in the
    PriorityQueue. */
    +  public int size() {
    +    return size;
    +  }
    +
    +  /** Returns the array used to hold the heap, with the smallest item at
    array[1]
    +   *  and the last (but not necessarily largest) at array[size()].  This
    is *not*
    +   *  fully sorted.
    +   */
    +  public long[] getInternalArray() {
    +    return heap;
    +  }
    +
    +  /** Pops the smallest n items from the heap, placing them in the
    internal array at
    +   *  arr[size] through arr[size-(n-1)] with the smallest (first element
    popped)
    +   *  being at arr[size].  The internal array is returned.
    +   */
    +  public long[] sort(int n) {
    +    while (--n >= 0) {
    +      long result = heap[1];             // save first value
    +      heap[1] = heap[size];              // move last to first
    +      heap[size] = result;                  // place it last
    +      size--;
    +      downHeap();                                // adjust heap
    +    }
    +    return heap;
    +  }
    +
    +  /** Removes all entries from the PriorityQueue. */
    +  public void clear() {
    +    size = 0;
    +  }
    +
    +  private void upHeap() {
    +    int i = size;
    +    long node = heap[i];                         // save bottom node
    +    int j = i >>> 1;
    +    while (j > 0 && node < heap[j]) {
    +      heap[i] = heap[j];                         // shift parents down
    +      i = j;
    +      j = j >>> 1;
    +    }
    +    heap[i] = node;                              // install saved node
    +  }
    +
    +  private void downHeap() {
    +    int i = 1;
    +    long node = heap[i];                         // save top node
    +    int j = i << 1;                              // find smaller child
    +    int k = j + 1;
    +    if (k <= size && heap[k] < heap[j]) {
    +      j = k;
    +    }
    +    while (j <= size && heap[j] < node) {
    +      heap[i] = heap[j];                         // shift up child
    +      i = j;
    +      j = i << 1;
    +      k = j + 1;
    +      if (k <= size && heap[k] < heap[j]) {
    +        j = k;
    +      }
    +    }
    +    heap[i] = node;                              // install saved node
    +  }
    +}

    Propchange:
    lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java

    ------------------------------------------------------------------------------
    svn:eol-style = native

    Propchange:
    lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java

    ------------------------------------------------------------------------------
    svn:executable = *

    Propchange:
    lucene/dev/trunk/solr/src/java/org/apache/solr/util/LongPriorityQueue.java

    ------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

    Modified:
    lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java
    URL:
    http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java?rev=992382&r1=992381&r2=992382&view=diff

    ==============================================================================
    --- lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java
    (original)
    +++ lucene/dev/trunk/solr/src/test/org/apache/solr/util/PrimUtilsTest.java
    Fri Sep  3 17:12:26 2010
    @@ -51,4 +51,44 @@ public class PrimUtilsTest extends Lucen
    }
    }

    +  public void testLongPriorityQueue() {
    +    int maxSize = 100;
    +    long[] a = new long[maxSize];
    +    long[] discards = new long[maxSize];
    +
    +    for (int iter=0; iter<100; iter++) {
    +      int discardCount = 0;
    +      int startSize = r.nextInt(maxSize) + 1;
    +      int endSize = startSize==maxSize ? maxSize : startSize +
    r.nextInt(maxSize-startSize);
    +      int adds = r.nextInt(maxSize+1);
    +      // System.out.println("startSize=" + startSize + " endSize=" +
    endSize + " adds="+adds);
    +      LongPriorityQueue pq = new LongPriorityQueue(startSize, endSize,
    Long.MIN_VALUE);
    +
    +      for (int i=0; i<adds; i++) {
    +        long v = r.nextLong();
    +        a[i] = v;
    +        long out = pq.insertWithOverflow(v);
    +        if (i < endSize) {
    +          assertEquals(out, Long.MIN_VALUE);
    +        } else {
    +          discards[discardCount++] = out;
    +        }
    +      }
    +      assertEquals(Math.min(adds,endSize), pq.size());
    +      assertEquals(adds, pq.size() + discardCount);
    +
    +      Arrays.sort(a, 0, adds);
    +      Arrays.sort(discards, 0, discardCount);
    +      for (int i=0; i<discardCount; i++) {
    +        assertEquals(a[i], discards[i]);
    +      }
    +
    +      for (int i=discardCount; i<adds; i++) {
    +        assertEquals(a[i], pq.pop());
    +      }
    +
    +      assertEquals(0, pq.size());
    +    }
    +  }
    +
    }
    \ No newline at end of file


    --
    Robert Muir
    rcmuir@gmail.com
    ---------------------------------------------------------------------
    To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
    For additional commands, e-mail: dev-help@lucene.apache.org
  • Yonik Seeley at Sep 3, 2010 at 6:37 pm

    On Fri, Sep 3, 2010 at 2:33 PM, Yonik Seeley wrote:
    On Fri, Sep 3, 2010 at 2:24 PM, Robert Muir wrote:
    Does SimpleFacetsTest fail for you?
    I svn up'ed and i see a few failures
    Ack - it does!  I'll fix.
    I'm wondering how that happened... I think I may have previously done a
    ant test -Dtestcase=TestSimpleFacets (notice the misspelling)
    Interesting... it succeeds when I run the test from intellij - that
    may be another reason why I missed it.
    But I'm sure I ran a full "ant test" after the last code changes (but
    that was last friday - then I went on vacation).

    -Yonik
    http://lucenerevolution.org  Lucene/Solr Conference, Boston Oct 7-8

    ---------------------------------------------------------------------
    To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
    For additional commands, e-mail: dev-help@lucene.apache.org
  • Robert Muir at Sep 3, 2010 at 6:38 pm

    On Fri, Sep 3, 2010 at 2:33 PM, Yonik Seeley wrote:
    Ack - it does! I'll fix.
    I'm wondering how that happened... I think I may have previously done a
    ant test -Dtestcase=TestSimpleFacets (notice the misspelling)
    seems likely, since i did the exact same thing trying to reproduce it!



    --
    Robert Muir
    rcmuir@gmail.com
  • Yonik Seeley at Sep 3, 2010 at 6:42 pm
    OK, should be fixed - it was an off-by-one in some assert statements
    (that apparently aren't turned on when I run tests from my IDE right
    now).

    -Yonik
    http://lucenerevolution.org Lucene/Solr Conference, Boston Oct 7-8

    On Fri, Sep 3, 2010 at 2:37 PM, Robert Muir wrote:

    On Fri, Sep 3, 2010 at 2:33 PM, Yonik Seeley wrote:

    Ack - it does!  I'll fix.
    I'm wondering how that happened... I think I may have previously done a
    ant test -Dtestcase=TestSimpleFacets (notice the misspelling)
    seems likely, since i did the exact same thing trying to reproduce it!


    --
    Robert Muir
    rcmuir@gmail.com
    ---------------------------------------------------------------------
    To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
    For additional commands, e-mail: dev-help@lucene.apache.org

Related Discussions

Discussion Navigation
viewthread | post
Discussion Overview
groupdev @
categorieslucene
postedSep 3, '10 at 6:25p
activeSep 3, '10 at 6:42p
posts5
users2
websitelucene.apache.org

2 users in discussion

Yonik Seeley: 3 posts Robert Muir: 2 posts

People

Translate

site design / logo © 2021 Grokbase