Jo Shields a575963da9 Imported Upstream version 3.6.0
Former-commit-id: da6be194a6b1221998fc28233f2503bd61dd9d14
2014-08-13 10:39:27 +01:00

234 lines
9.1 KiB
C#

/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Query;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Test.Analysis;
using NUnit.Framework;
using Version=Lucene.Net.Util.Version;
namespace Lucene.Net.Analyzers.Query
{
    /// <summary>
    /// Tests for <see cref="QueryAutoStopWordAnalyzer"/>. Every test runs against a
    /// small in-memory index built in <see cref="SetUp"/>.
    /// </summary>
    [TestFixture]
    public class QueryAutoStopWordAnalyzerTest : BaseTokenStreamTestCase
    {
        // Values cycled through when populating "variedField": 10 entries, "the" appears twice.
        private readonly String[] variedFieldValues = { "the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog" };

        // Values cycled through when populating "repetitiveField": "boring" is two out of every three values.
        private readonly String[] repetitiveFieldValues = { "boring", "boring", "vaguelyboring" };

        private RAMDirectory dir;
        private Analyzer appAnalyzer;
        private IndexReader reader;
        private QueryAutoStopWordAnalyzer protectedAnalyzer;

        /// <summary>
        /// Builds a 200-document index with one term per field per document, opens a
        /// read-only reader on it and wraps the whitespace analyzer in the analyzer under test.
        /// </summary>
        public override void SetUp()
        {
            dir = new RAMDirectory();
            appAnalyzer = new WhitespaceAnalyzer();
            IndexWriter writer = new IndexWriter(dir, appAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                int numDocs = 200;
                for (int i = 0; i < numDocs; i++)
                {
                    Document doc = new Document();
                    String variedFieldValue = variedFieldValues[i % variedFieldValues.Length];
                    String repetitiveFieldValue = repetitiveFieldValues[i % repetitiveFieldValues.Length];
                    doc.Add(new Field("variedField", variedFieldValue, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("repetitiveField", repetitiveFieldValue, Field.Store.YES, Field.Index.ANALYZED));
                    writer.AddDocument(doc);
                }
            }
            finally
            {
                // Release the writer even when indexing fails part-way through.
                writer.Close();
            }
            reader = IndexReader.Open(dir, true);
            protectedAnalyzer = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, appAnalyzer);
            base.SetUp();
        }

        /// <summary>
        /// Closes the reader and the directory created in <see cref="SetUp"/>, then runs the base teardown.
        /// </summary>
        public override void TearDown()
        {
            try
            {
                if (reader != null)
                {
                    reader.Close();
                }
                if (dir != null)
                {
                    dir.Close();
                }
            }
            finally
            {
                // Always give the base class a chance to clean up, even if closing failed.
                base.TearDown();
            }
        }

        /// <summary>
        /// Helper: parses <paramref name="queryString"/> with analyzer <paramref name="a"/>
        /// (default field "repetitiveField") and returns the total hit count against the test index.
        /// </summary>
        /// <param name="a">Analyzer handed to the query parser.</param>
        /// <param name="queryString">Query in query-parser syntax.</param>
        /// <returns>Total number of hits (the search collects at most 1000 top docs).</returns>
        private int Search(Analyzer a, String queryString)
        {
            QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "repetitiveField", a);
            var q = qp.Parse(queryString);
            IndexSearcher searcher = new IndexSearcher(reader);
            try
            {
                return searcher.Search(q, null, 1000).TotalHits;
            }
            finally
            {
                // A searcher is created per call, so close it per call; the shared
                // reader itself is closed in TearDown.
                searcher.Close();
            }
        }

        /// <summary>
        /// Without any AddStopWords call the wrapping analyzer must give the same hits as the wrapped one.
        /// </summary>
        [Test]
        public void TestUninitializedAnalyzer()
        {
            //Note: no calls to "addStopWord"
            String query = "variedField:quick repetitiveField:boring";
            int actualHits = Search(protectedAnalyzer, query);
            int expectedHits = Search(appAnalyzer, query);
            Assert.AreEqual(expectedHits, actualHits, "No filtering test");
        }

        /*
         * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.AddStopWords(IndexReader)'
         */
        [Test]
        public void TestDefaultAddStopWordsIndexReader()
        {
            protectedAnalyzer.AddStopWords(reader);
            int numHits = Search(protectedAnalyzer, "repetitiveField:boring");
            Assert.AreEqual(0, numHits, "Default filter should remove all docs");
        }

        /*
         * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.AddStopWords(IndexReader, float)'
         * (the second argument is the maximum fraction of documents a term may appear in)
         */
        [Test]
        public void TestAddStopWordsIndexReaderInt()
        {
            protectedAnalyzer.AddStopWords(reader, 1f / 2f);
            int numHits = Search(protectedAnalyzer, "repetitiveField:boring");
            Assert.AreEqual(0, numHits, "A filter on terms in > one half of docs remove boring docs");

            numHits = Search(protectedAnalyzer, "repetitiveField:vaguelyboring");
            Assert.True(numHits > 1, "A filter on terms in > half of docs should not remove vaguelyBoring docs");

            protectedAnalyzer.AddStopWords(reader, 1f / 4f);
            numHits = Search(protectedAnalyzer, "repetitiveField:vaguelyboring");
            Assert.AreEqual(0, numHits, "A filter on terms in > quarter of docs should remove vaguelyBoring docs");
        }

        /// <summary>
        /// Stop words gathered for one field must only affect queries on that field.
        /// </summary>
        [Test]
        public void TestAddStopWordsIndexReaderStringFloat()
        {
            protectedAnalyzer.AddStopWords(reader, "variedField", 1f / 2f);
            int numHits = Search(protectedAnalyzer, "repetitiveField:boring");
            Assert.True(numHits > 0, "A filter on one Field should not affect queris on another");

            protectedAnalyzer.AddStopWords(reader, "repetitiveField", 1f / 2f);
            numHits = Search(protectedAnalyzer, "repetitiveField:boring");
            Assert.AreEqual(0, numHits, "A filter on the right Field should affect queries on it");
        }

        /// <summary>
        /// The count returned by AddStopWords must match the number of terms reported by GetStopWords.
        /// </summary>
        [Test]
        public void TestAddStopWordsIndexReaderStringInt()
        {
            int numStopWords = protectedAnalyzer.AddStopWords(reader, "repetitiveField", 10);
            Assert.True(numStopWords > 0, "Should have identified stop words");

            Term[] t = protectedAnalyzer.GetStopWords();
            Assert.AreEqual(numStopWords, t.Length, "num terms should = num stopwords returned");

            int numNewStopWords = protectedAnalyzer.AddStopWords(reader, "variedField", 10);
            Assert.True(numNewStopWords > 0, "Should have identified more stop words");

            t = protectedAnalyzer.GetStopWords();
            Assert.AreEqual(numStopWords + numNewStopWords, t.Length, "num terms should = num stopwords returned");
        }

        /// <summary>
        /// A term that is a stop word in one field must still be searchable in another field.
        /// </summary>
        [Test]
        public void TestNoFieldNamePollution()
        {
            protectedAnalyzer.AddStopWords(reader, "repetitiveField", 10);
            int numHits = Search(protectedAnalyzer, "repetitiveField:boring");
            Assert.AreEqual(0, numHits, "Check filter set up OK");

            numHits = Search(protectedAnalyzer, "variedField:boring");
            Assert.True(numHits > 0, "Filter should not prevent stopwords in one field being used in another ");
        }

        /*
         * subclass that acts just like whitespace analyzer for testing
         */
        private sealed class QueryAutoStopWordSubclassAnalyzer : QueryAutoStopWordAnalyzer
        {
            public QueryAutoStopWordSubclassAnalyzer(Version matchVersion)
                : base(matchVersion, new WhitespaceAnalyzer())
            {
            }

            // Ignores the base implementation and returns a plain whitespace tokenizer.
            public override TokenStream TokenStream(String fieldName, TextReader reader)
            {
                return new WhitespaceTokenizer(reader);
            }
        }

        /// <summary>
        /// Backwards-compatibility check (LUCENE-1678): with a subclass that overrides
        /// TokenStream, a query for "boring" is expected to still return hits after AddStopWords.
        /// </summary>
        [Test]
        public void TestLucene1678BwComp()
        {
            QueryAutoStopWordAnalyzer a = new QueryAutoStopWordSubclassAnalyzer(Version.LUCENE_CURRENT);
            a.AddStopWords(reader, "repetitiveField", 10);
            int numHits = Search(a, "repetitiveField:boring");
            Assert.AreNotEqual(0, numHits, "Subclass overriding TokenStream should still match 'boring'");
        }

        /*
         * analyzer that does not support reuse
         * it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
         */
        private sealed class NonreusableAnalyzer : Analyzer
        {
            private int invocationCount;

            public override TokenStream TokenStream(String fieldName, TextReader reader)
            {
                if (++invocationCount % 2 == 0)
                {
                    return new WhitespaceTokenizer(reader);
                }
                else
                {
                    return new LetterTokenizer(reader);
                }
            }
        }

        /// <summary>
        /// Stop-word filtering must also work when the wrapped analyzer returns a
        /// different tokenizer type on each invocation.
        /// </summary>
        [Test]
        public void TestWrappingNonReusableAnalyzer()
        {
            QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new NonreusableAnalyzer());
            a.AddStopWords(reader, 10);

            int numHits = Search(a, "repetitiveField:boring");
            Assert.AreEqual(0, numHits, "'boring' should be filtered as a stop word");

            numHits = Search(a, "repetitiveField:vaguelyboring");
            Assert.AreEqual(0, numHits, "'vaguelyboring' should be filtered as a stop word");
        }

        /// <summary>
        /// The token stream for "this boring" on "repetitiveField" must yield only "this".
        /// </summary>
        [Test]
        public void TestTokenStream()
        {
            QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new WhitespaceAnalyzer());
            a.AddStopWords(reader, 10);
            TokenStream ts = a.TokenStream("repetitiveField", new StringReader("this boring"));
            ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();
            Assert.True(ts.IncrementToken());
            Assert.AreEqual("this", termAtt.Term);
            Assert.False(ts.IncrementToken());
        }
    }
}