Index: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java =================================================================== --- lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (revision 1385125) +++ lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (working copy) @@ -431,13 +431,27 @@ System.out.println("TEST: saved to /x/tmp3/out.dot"); } - private static final Automaton SEP_A = BasicAutomata.makeCharRange(TokenStreamToAutomaton.POS_SEP, - TokenStreamToAutomaton.POS_SEP); + private static final Automaton SEP_A = BasicAutomata.makeChar(TokenStreamToAutomaton.POS_SEP); + private static final Automaton HOLE_A = BasicAutomata.makeChar(TokenStreamToAutomaton.HOLE); - private static final String SEP = "" + (char) TokenStreamToAutomaton.POS_SEP; + private Automaton join(String ... strings) { + List as = new ArrayList(); + for(String s : strings) { + as.add(BasicAutomata.makeString(s)); + as.add(SEP_A); + } + as.remove(as.size()-1); + return BasicOperations.concatenate(as); + } - private static final String HOLE = "" + (char) TokenStreamToAutomaton.HOLE; + private Automaton join(Automaton ... 
as) { + return BasicOperations.concatenate(Arrays.asList(as)); + } + private Automaton s2a(String s) { + return BasicAutomata.makeString(s); + } + public void testTwoTokens() throws Exception { final TokenStream ts = new CannedTokenStream( @@ -446,7 +460,7 @@ token("def", 1, 1), }); final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts); - final Automaton expected = BasicAutomata.makeString("abc" + SEP + "def"); + final Automaton expected = join("abc", "def"); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); @@ -461,7 +475,7 @@ }); final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts); - final Automaton expected = BasicAutomata.makeString("abc" + SEP + HOLE + SEP + "def"); + final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); @@ -492,7 +506,7 @@ }); final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts); final Automaton a1 = BasicAutomata.makeString("xyz"); - final Automaton a2 = BasicAutomata.makeString("abc" + SEP + "def"); + final Automaton a2 = join("abc", "def"); final Automaton expected = BasicOperations.union(a1, a2); //toDot(actual); @@ -509,10 +523,10 @@ }); final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts); final Automaton a1 = BasicOperations.union( - BasicAutomata.makeString("a" + SEP + HOLE), + join(s2a("a"), SEP_A, HOLE_A), BasicAutomata.makeString("X")); final Automaton expected = BasicOperations.concatenate(a1, - BasicAutomata.makeString(SEP + "b")); + join(SEP_A, s2a("b"))); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); } @@ -527,7 +541,7 @@ }); final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts); final Automaton expected = BasicOperations.union( - BasicAutomata.makeString("xyz" + SEP + HOLE + SEP + "def"), + join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), BasicAutomata.makeString("abc")); 
assertTrue(BasicOperations.sameLanguage(expected, actual)); } @@ -543,7 +557,7 @@ }); final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts); final Automaton a1 = BasicAutomata.makeString("xyz"); - final Automaton a2 = BasicAutomata.makeString("abc" + SEP + "def" + SEP + "ghi"); + final Automaton a2 = join("abc", "def", "ghi"); final Automaton expected = BasicOperations.union(a1, a2); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); @@ -562,7 +576,7 @@ token("abc", 2, 1), }); final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts); - final Automaton expected = BasicAutomata.makeString(HOLE + SEP + "abc"); + final Automaton expected = join(HOLE_A, SEP_A, s2a("abc")); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); } Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1385125) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -836,7 +836,9 @@ public static BytesRef toBytesRef(IntsRef input, BytesRef scratch) { scratch.grow(input.length); for(int i=0;i= Byte.MIN_VALUE && value <= Byte.MAX_VALUE: "value " + value + " doesn't fit into byte"; + scratch.bytes[i] = (byte) value; } scratch.length = input.length; return scratch; Index: lucene/core/src/java/org/apache/lucene/util/automaton/State.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/automaton/State.java (revision 1385124) +++ lucene/core/src/java/org/apache/lucene/util/automaton/State.java (working copy) @@ -62,7 +62,7 @@ /** * Resets transition set. 
*/ - final void resetTransitions() { + public final void resetTransitions() { transitionsArray = new Transition[0]; numTransitions = 0; } @@ -165,7 +165,11 @@ } } - void addEpsilon(State to) { + /** Virtually adds an epsilon transition to the target + * {@code to} state. This is implemented by copying all + * transitions from {@code to} to this state, and if {@code + * to} is an accept state then set accept for this state. */ + public void addEpsilon(State to) { if (to.accept) accept = true; for (Transition t : to.getTransitions()) addTransition(t); Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 1385125) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy) @@ -64,14 +64,10 @@ } /** We create transition between two adjacent tokens. */ - // nocommit should we ues 256? ie, outside of the utf8 - // byte range... - public static final int POS_SEP = 0; + public static final int POS_SEP = 256; /** We add this arc to represent a hole. */ - // nocommit should we ues 257? ie, outside of the utf8 - // byte range... 
- public static final int HOLE = 1; + public static final int HOLE = 257; /** Pulls the graph (including {@link * PositionLengthAttribute}) from the provided {@link Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java (revision 1385125) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java (working copy) @@ -120,6 +120,62 @@ assertEquals("the ghost of christmas past", results.get(0).key.toString()); assertEquals(50, results.get(0).value, 0.01F); } + + public void testNoSeps() throws Exception { + TermFreq[] keys = new TermFreq[] { + new TermFreq("ab cd", 0), + new TermFreq("abcd", 1), + }; + + int options = 0; + + // Should have no bearing on this test: + if (random().nextBoolean()) { + options |= AnalyzingCompletionLookup.PRESERVE_HOLES; + } + + AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random()), options); + suggester.build(new TermFreqArrayIterator(keys)); + // nocommit if i change this to "ab " ... the test fails + // now but really it should pass??? problem is + // analyzers typically strip trailing space? really we + // need a SEP token appear instead...? 
+ List r = suggester.lookup(_TestUtil.stringToCharSequence("ab c", random()), false, 2); + assertEquals(2, r.size()); + + // With no PRESERVE_SEPS specified, "ab c" should also + // complete to "abcd", which has higher weight so should + // appear first: + assertEquals("abcd", r.get(0).key.toString()); + } + + public void testNoHoles() throws Exception { + TermFreq[] keys = new TermFreq[] { + new TermFreq("ab cd", 0), + new TermFreq("abcd", 1), + }; + + int options = 0; + + // Should have no bearing on this test: + if (random().nextBoolean()) { + options |= AnalyzingCompletionLookup.PRESERVE_HOLES; + } + + AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random()), options); + suggester.build(new TermFreqArrayIterator(keys)); + // nocommit if i change this to "ab " ... the test fails + // now but really it should pass??? problem is + // analyzers typically strip trailing space? really we + // need a SEP token appear instead...? + List r = suggester.lookup(_TestUtil.stringToCharSequence("ab c", random()), false, 2); + assertEquals(2, r.size()); + + // With no PRESERVE_SEPS specified, "ab c" should also + // complete to "abcd", which has higher weight so should + // appear first: + assertEquals("abcd", r.get(0).key.toString()); + } public void testInputPathRequired() throws Exception { TermFreq keys[] = new TermFreq[] { @@ -238,7 +294,10 @@ keys[i] = new TermFreq(s, weight); } - AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false), false); + // nocommit also test NOT preserving seps/holes + // nocommit why no failure if we DON'T preserve seps/holes...? 
+ AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false), + AnalyzingCompletionLookup.PRESERVE_SEPS | AnalyzingCompletionLookup.PRESERVE_HOLES); suggester.build(new TermFreqArrayIterator(keys)); for (String prefix : allPrefixes) { Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java (revision 1385125) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java (working copy) @@ -48,16 +48,18 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.fst.PairOutputs; -import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util.MinResult; import org.apache.lucene.util.fst.Util; -import org.apache.lucene.util.fst.Util.MinResult; /** * Suggester that first analyzes the surface form, adds the @@ -105,26 +107,117 @@ */ private final boolean exactFirst; + /** + * True if separator between tokens should be preserved. + */ + private final boolean preserveSep; + + /** + * True if holes (e.g. from stop word removal) should be preserved. 
+ */ + private final boolean preserveHoles; + /** - * Calls {@link #AnalyzingCompletionLookup(Analyzer,boolean) AnalyzingCompletionLookup(analyzer, true)} + * Calls {@link #AnalyzingCompletionLookup(Analyzer,int) AnalyzingCompletionLookup(analyzer, EXACT_FIRST | PRESERVE_SEPS | PRESERVE_HOLES)} */ public AnalyzingCompletionLookup(Analyzer analyzer) { - this(analyzer, true); + this(analyzer, EXACT_FIRST | PRESERVE_SEPS | PRESERVE_HOLES); } - + + /** Include this flag in the options parameter {@link + * #AnalyzingCompletionLookup(Analyzer,int)} to always + * return exact matches first regardless of score. This + * has no performance impact but could result in + * low-quality suggestions. */ + public static final int EXACT_FIRST = 1; + + /** Include this flag in the options parameter {@link + * #AnalyzingCompletionLookup(Analyzer,int)} to preserve + * token separators when matching. */ + public static final int PRESERVE_SEPS = 2; + + // nocommit: maybe nuke PRESERVE_HOLES? because you can + // always eg tell StopFilter to enablePosInc=false? if we + // don't nuke it then we need test case! + + /** Include this flag in the options parameter {@link + * #AnalyzingCompletionLookup(Analyzer,int)} to preserve + * holes when matching. */ + public static final int PRESERVE_HOLES = 4; + + /** Represents the separation between tokens, if + * PRESERVE_SEPS was specified */ + private static final byte SEP_BYTE = 0; + + /** Represents a hole (e.g. caused by StopFilter), if + * PRESERVE_HOLES was specified */ + private static final byte HOLE_BYTE = 1; + /** * Creates a new suggester. * * @param analyzer Analyzer that will be used for analyzing suggestions. - * @param exactFirst true if suggestions that match the + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEPS}, {@link #PRESERVE_HOLES} * prefix exactly should always be returned first, regardless * of score. This has no performance impact, but could result * in low-quality suggestions. 
*/ - public AnalyzingCompletionLookup(Analyzer analyzer, boolean exactFirst) { + public AnalyzingCompletionLookup(Analyzer analyzer, int options) { this.analyzer = analyzer; - this.exactFirst = exactFirst; + if ((options & ~(EXACT_FIRST | PRESERVE_SEPS | PRESERVE_HOLES)) != 0) { + throw new IllegalArgumentException("options should only contain EXACT_FIRST, PRESERVE_SEPS and PRESERVE_HOLES; got " + options); + } + this.exactFirst = (options & EXACT_FIRST) != 0; + this.preserveSep = (options & PRESERVE_SEPS) != 0; + this.preserveHoles = (options & PRESERVE_HOLES) != 0; } + + // Replaces SEP and HOLE with epsilon, or remaps them if + // we were asked to preserve them: + private void replaceSepAndHoles(Automaton a) { + + State[] states = a.getNumberedStates(); + + // Go in reverse topo sort so we know we only have to + // make one pass: + for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) { + final State state = states[stateNumber]; + List newTransitions = new ArrayList(); + for(Transition t : state.getTransitions()) { + assert t.getMin() == t.getMax(); + if (t.getMin() == TokenStreamToAutomaton.POS_SEP) { + if (preserveSep) { + // Remap to SEP_BYTE: + t = new Transition(SEP_BYTE, t.getDest()); + } else { + // NOTE: sort of weird because this will grow + // the transition array we are iterating over, + // but because we are going in reverse topo sort + // it will not add any SEP/HOLE transitions: + state.addEpsilon(t.getDest()); + t = null; + } + } else if (t.getMin() == TokenStreamToAutomaton.HOLE) { + if (preserveHoles) { + // Remap to HOLE_BYTE: + t = new Transition(HOLE_BYTE, t.getDest()); + } else { + // NOTE: sort of weird because this will grow + // the transition array we are iterating over, + // but because we are going in reverse topo sort + // it will not add any SEP/HOLE transitions: + state.addEpsilon(t.getDest()); + t = null; + } + } + if (t != null) { + newTransitions.add(t); + } + } + state.resetTransitions(); + 
state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()])); + } + } @Override public void build(TermFreqIterator iterator) throws IOException { @@ -137,8 +230,6 @@ Sort.ByteSequencesReader reader = null; BytesRef scratch = new BytesRef(); - assert TokenStreamToAutomaton.POS_SEP < Byte.MAX_VALUE; - BytesRef separator = new BytesRef(new byte[] { (byte)TokenStreamToAutomaton.POS_SEP }); // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short) @@ -158,6 +249,9 @@ Automaton automaton = TokenStreamToAutomaton.toAutomaton(ts); ts.end(); ts.close(); + + replaceSepAndHoles(automaton); + assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be @@ -166,7 +260,7 @@ // nocommit: we should probably not wire this param to -1 but have a reasonable limit?! Set paths = SpecialOperations.getFiniteStrings(automaton, -1); - for (IntsRef path : paths) { + for (IntsRef path : paths) { Util.toBytesRef(path, scratch); @@ -203,6 +297,7 @@ BytesRef surface = new BytesRef(); IntsRef scratchInts = new IntsRef(); ByteArrayDataInput input = new ByteArrayDataInput(); + int dedup = 0; while (reader.read(scratch)) { input.reset(scratch.bytes, scratch.offset, scratch.length); input.setPosition(input.length()-2); @@ -226,8 +321,21 @@ // increasing bytes (it wont matter) ... or we // could use multiple outputs for a single input? // this would be more efficient? - continue; + if (dedup < 256) { + analyzed.grow(analyzed.length+1); + analyzed.bytes[analyzed.length] = (byte) dedup; + dedup++; + analyzed.length++; + } else { + // nocommit add param to limit "dups"??? + + // More than 256 dups: skip the rest: + continue; + } + } else { + dedup = 0; } + Util.toIntsRef(analyzed, scratchInts); // nocommit (why?) 
builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface))); @@ -289,6 +397,8 @@ throw new RuntimeException(bogus); } + replaceSepAndHoles(automaton); + // TODO: we can optimize this somewhat by determinizing // while we convert automaton = Automaton.minimize(automaton); @@ -307,7 +417,11 @@ throw new RuntimeException(bogus); } - // nocommit maybe nuke exactFirst...? but... it's useful? + // nocommit maybe nuke exactFirst...? but... it's + // useful? + // nocommit: exactFirst is not well defined here ... the + // "exactness" refers to the analyzed form not the + // surface form if (exactFirst) { for (FSTUtil.Path> path : prefixPaths) { if (path.fstNode.isFinal()) {