Friday, May 18, 2007

Lucene sorting Strings

Lucene out of the box stores the whole String for each entry (or term) if you make it sort by a String column. In most cases you don't need that precision so if you for instance only want to sort by uppercase characters and can live with sorting only by the first six of them youc an cut down your memory consumption dramatically.

All you have to do is write some custom comparator and some custom SortComparator with the right source, too. Here is the code:


















001 package com.sony.soe.platform.players.util;

002

003 import java.io.IOException;

004 import java.util.Comparator;

005

006 import org.apache.lucene.index.IndexReader;

007 import org.apache.lucene.index.Term;

008 import org.apache.lucene.index.TermDocs;

009 import org.apache.lucene.index.TermEnum;

010 import org.apache.lucene.search.ScoreDoc;

011 import org.apache.lucene.search.ScoreDocComparator;

012 import org.apache.lucene.search.SortComparatorSource;

013 import org.apache.lucene.search.SortField;

014

015 import com.sony.soe.platform.players.exception.PlayersException;

016

017 /**

018 * We have some special requirements regarding Strings so treat them special;-)

019 * @author geichberger

020 *

021 */

022 public class CustomComparator implements Comparator, SortComparatorSource {

023

024 public int compare(Object value1, Object value2) {

025 return staticCompare(value1, value2);

026 }

027

028 public static int staticCompare(Object value1, Object value2) {

029 if(value1 == null && value2 == null){

030 return 0;

031 }

032 //null is a smaller value than Not Null

033 if(value1 == null)

034 return -1;

035 if(value2 == null)

036 return 1;

037

038 if ((value1 instanceof String) && (value2 instanceof String)) {

039 String s1 = ((String)value1).replaceAll("[\\W[_]]", ""); //remove all funny characters

040 String s2 = ((String)value2).replaceAll("[\\W[_]]", ""); //remove all funny characters

041 return s1.compareToIgnoreCase(s2);

042 } else if (value1 instanceof Comparable) {

043 return ((Comparable) value1).compareTo((Comparable) value2);

044 }

045 return 0;

046 }

047

048 public ScoreDocComparator newComparator(IndexReader arg0, String arg1) throws IOException {

049 return new FirstStageStringDocComparator(arg0, arg1);

050 }

051

052 public static class FirstStageStringDocComparator implements ScoreDocComparator {

053 private int scores[];

054

055 public FirstStageStringDocComparator() {

056

057 }

058

059 public FirstStageStringDocComparator(IndexReader reader, String fieldname) throws IOException, PlayersException {

060 final TermEnum enumerator = reader.terms(new Term(fieldname, ""));

061 scores = new int[reader.maxDoc()];

062 if (scores.length>0) {

063 TermDocs termDocs = reader.termDocs();

064 try {

065 if (enumerator.term() == null ) {

066 throw new PlayersException();

067 }

068 do {

069 Term term = enumerator.term();

070 if (!term.field().equalsIgnoreCase(fieldname)) break;

071 termDocs.seek(enumerator);

072 while (termDocs.next()) {

073 String s = term.text().replaceAll("[\\W[_[\\d]]]", "").toUpperCase(); //work our magic (also ignore numbers)

074

075 int score = 0;

076 for (int i=0; i<6 && i<s.length(); i++) {

077 //the score is the value of the first three characters base 26;-)

078 //max score 26^3=17,576

079 score += (s.charAt(i)-65)*Math.pow(26, 5-i); //this is some shortcut because we ignore numbers...

080 }

081 scores[termDocs.doc()] = score;

082 }

083 } while (enumerator.next());

084 } finally {

085 termDocs.close();

086 }

087 }

088 }

089

090 public int compare(ScoreDoc i, ScoreDoc j) {

091 if (scores[i.doc] < scores[j.doc]) return -1;

092 if (scores[i.doc] > scores[j.doc]) return 1;

093 return 0;

094 }

095

096 public int sortType() {

097 return SortField.INT;

098 }

099

100 public Comparable sortValue(ScoreDoc i) {

101 return new Integer(scores[i.doc]);

102 }

103

104 public int[] getScores() {

105 return scores;

106 }

107

108 public void setScores(int[] scores) {

109 this.scores = scores;

110 }

111

112

113

114 }

115

116 }




Java2html




Then to use it all you have to do is use the following Sortfield in your query:new SortField(indexPart, new CustomComparator(), reverse)

UPDATE: You will also need to add hashcode and equals to the CustomComperator class - otherwise you will create a memory leak.

1 Comments:

Blogger German said...

I had the pleasure to talk to the Lucene guys at ApacheCon and they promised me to add some way of carrying through the score they give during indexing so it can be used for sorting.

This wasn't really big in their agenda because they mostly are interested in sorting by relevance.

6:05 PM

 

Post a Comment

<< Home