372 lines
17 KiB
C#
372 lines
17 KiB
C#
/*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
* this work for additional information regarding copyright ownership.
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
* (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using NUnit.Framework;
|
|
|
|
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
|
|
using WhitespaceAnalyzer = Lucene.Net.Analysis.WhitespaceAnalyzer;
|
|
using Document = Lucene.Net.Documents.Document;
|
|
using Field = Lucene.Net.Documents.Field;
|
|
using IndexWriter = Lucene.Net.Index.IndexWriter;
|
|
using IndexReader = Lucene.Net.Index.IndexReader;
|
|
using Term = Lucene.Net.Index.Term;
|
|
using RAMDirectory = Lucene.Net.Store.RAMDirectory;
|
|
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
|
|
using Directory = Lucene.Net.Store.Directory;
|
|
using MockRAMDirectory = Lucene.Net.Store.MockRAMDirectory;
|
|
using QueryParser = Lucene.Net.QueryParsers.QueryParser;
|
|
|
|
namespace Lucene.Net.Search
|
|
{
|
|
|
|
/// <summary>
/// Tests <see cref="FuzzyQuery"/>.
/// </summary>
|
|
[TestFixture]
|
|
public class TestFuzzyQuery:LuceneTestCase
|
|
{
|
|
|
|
/// <summary>
/// Indexes seven five-character terms and checks which of them a
/// <see cref="FuzzyQuery"/> matches, and in which order, while the third
/// constructor argument (the prefix length, per the comments below) grows.
/// Also checks that lowering <c>BooleanQuery.MaxClauseCount</c> reduces the
/// number of hits.
/// </summary>
[Test]
public virtual void TestFuzziness()
{
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true,
                                         IndexWriter.MaxFieldLength.LIMITED);
    AddDoc("aaaaa", writer);
    AddDoc("aaaab", writer);
    AddDoc("aaabb", writer);
    AddDoc("aabbb", writer);
    AddDoc("abbbb", writer);
    AddDoc("bbbbb", writer);
    AddDoc("ddddd", writer);
    writer.Optimize();
    writer.Close();
    IndexSearcher searcher = new IndexSearcher(directory, true);

    FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
    ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);

    // same with prefix
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(2, hits.Length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    // test scoring: hits must come back in this exact order
    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length, "3 documents should match");
    string[] order = { "bbbbb", "abbbb", "aabbb" };
    for (int i = 0; i < hits.Length; i++)
    {
        string term = searcher.Doc(hits[i].Doc).Get("field");
        Assert.AreEqual(order[i], term);
    }

    // test BooleanQuery.MaxClauseCount; the static setting is restored in
    // the finally block so other tests are not affected.
    int savedClauseCount = BooleanQuery.MaxClauseCount;
    try
    {
        BooleanQuery.MaxClauseCount = 2;
        // This query would normally return 3 documents, because 3 terms match (see above):
        query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(2, hits.Length, "only 2 documents should match");
        order = new[] { "bbbbb", "abbbb" };
        for (int i = 0; i < hits.Length; i++)
        {
            string term = searcher.Doc(hits[i].Doc).Get("field");
            Assert.AreEqual(order[i], term);
        }
    }
    finally
    {
        BooleanQuery.MaxClauseCount = savedClauseCount;
    }

    // not similar enough:
    query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);
    // edit distance to "aaaaa" = 3
    query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // query identical to a word in the index:
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    // default allows for up to two edits:
    Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
    Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));

    // query similar to a word in the index:
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
    Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));

    // now with prefix
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
    Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
    Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(3, hits.Length);
    Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
    Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(2, hits.Length);
    Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));

    // now with prefix
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // different field = no match:
    query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    searcher.Close();
    directory.Close();
}
|
|
|
|
/// <summary>
/// Indexes two seven-character terms and checks <see cref="FuzzyQuery"/>
/// matching for longer terms, for varying prefix lengths and for a raised
/// minimum similarity. Also checks that a minimum similarity of 1.1 or -0.1
/// is rejected with an <see cref="ArgumentException"/>.
/// </summary>
[Test]
public virtual void TestFuzzinessLong()
{
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    AddDoc("aaaaaaa", writer);
    AddDoc("segment", writer);
    writer.Optimize();
    writer.Close();
    IndexSearcher searcher = new IndexSearcher(directory, true);

    FuzzyQuery query;
    // not similar enough:
    query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
    ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);
    // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
    // in TestFuzziness so a bigger difference is allowed:
    query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));

    // now with prefix
    query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));
    query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // no match, more than half of the characters are wrong:
    query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // now with prefix
    query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // "student" and "stellent" are indeed similar to "segment" by default:
    query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    // now with prefix
    query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);
    query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // "student" doesn't match anymore thanks to increased minimum similarity:
    query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // a minimum similarity of 1.1 or -0.1 must be rejected by the constructor
    Assert.Throws<ArgumentException>(() => new FuzzyQuery(new Term("field", "student"), 1.1f),
                                     "Expected ArgumentException");
    Assert.Throws<ArgumentException>(() => new FuzzyQuery(new Term("field", "student"), -0.1f),
                                     "Expected ArgumentException");

    searcher.Close();
    directory.Close();
}
|
|
|
|
/// <summary>
/// Runs <see cref="FuzzyQuery"/> with a minimum similarity of 0.9 against
/// query terms shorter than, equal to and longer than 10 characters and
/// checks the hit counts. Per the inline comments, this targets a
/// length-based shortcut inside the query implementation.
/// </summary>
[Test]
public virtual void TestTokenLengthOpt()
{
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    AddDoc("12345678911", writer);
    AddDoc("segment", writer);
    writer.Optimize();
    writer.Close();
    IndexSearcher searcher = new IndexSearcher(directory, true);

    Query query;
    // term not over 10 chars, so optimization shortcuts
    query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
    ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // 10 chars, so no optimization
    query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // over 10 chars, so no optimization
    query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    // over 10 chars, no match
    query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
    hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(0, hits.Length);

    // Release the searcher and directory, as the other tests in this fixture do.
    searcher.Close();
    directory.Close();
}
|
|
|
|
/// <summary>
/// Indexes a handful of short titles with a <see cref="StandardAnalyzer"/>,
/// parses the query string "giga~0.9" and checks that the only hit is the
/// document whose stored field is "Giga byte".
/// </summary>
[Test]
public virtual void TestGiga()
{
    StandardAnalyzer analyzer = new StandardAnalyzer(Util.Version.LUCENE_CURRENT);

    Directory index = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(index, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

    AddDoc("Lucene in Action", w);
    AddDoc("Lucene for Dummies", w);

    AddDoc("Giga byte", w);

    AddDoc("ManagingGigabytesManagingGigabyte", w);
    AddDoc("ManagingGigabytesManagingGigabytes", w);

    AddDoc("The Art of Computer Science", w);
    AddDoc("J. K. Rowling", w);
    AddDoc("JK Rowling", w);
    AddDoc("Joanne K Roling", w);
    AddDoc("Bruce Willis", w);
    AddDoc("Willis bruce", w);
    AddDoc("Brute willis", w);
    AddDoc("B. willis", w);

    // Obtain the reader from the writer before the writer is closed.
    IndexReader r = w.GetReader();
    w.Close();

    Query q = new QueryParser(Util.Version.LUCENE_CURRENT, "field", analyzer).Parse("giga~0.9");

    // search
    IndexSearcher searcher = new IndexSearcher(r);
    ScoreDoc[] hits = searcher.Search(q, 10).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("Giga byte", searcher.Doc(hits[0].Doc).Get("field"));

    // Release everything the test opened; the directory goes last, after
    // the searcher and reader that were created on top of it.
    searcher.Close();
    r.Close();
    index.Close();
}
|
|
|
|
/// <summary>
/// Adds one document to <paramref name="writer"/> whose only field, named
/// "field", holds <paramref name="text"/> (stored and analyzed).
/// </summary>
/// <param name="text">Value of the "field" field.</param>
/// <param name="writer">Index writer that receives the new document.</param>
private void AddDoc(System.String text, IndexWriter writer)
{
    Field contents = new Field("field", text, Field.Store.YES, Field.Index.ANALYZED);

    Document document = new Document();
    document.Add(contents);

    writer.AddDocument(document);
}
|
|
}
|
|
} |