Jo Shields a575963da9 Imported Upstream version 3.6.0
Former-commit-id: da6be194a6b1221998fc28233f2503bd61dd9d14
2014-08-13 10:39:27 +01:00

301 lines
12 KiB

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Shingle;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Test.Analysis;
using NUnit.Framework;
using Directory = Lucene.Net.Store.Directory;
namespace Lucene.Net.Analyzers.Shingle
/// <summary>
/// A test class for ShingleAnalyzerWrapper as regards queries and scoring.
/// </summary>
public class ShingleAnalyzerWrapperTest : BaseTokenStreamTestCase
public IndexSearcher Searcher;
/// <summary>
/// Set up a new index in RAM with three test phrases and the supplied Analyzer.
/// </summary>
/// <remarks>
/// Each phrase is stored in its own document under the "content" field. The tests in
/// this class refer to the documents as 0, 1 and 2 - presumably the ids assigned in
/// insertion order, so the order of the phrases below matters.
/// </remarks>
/// <param name="analyzer">the analyzer to use</param>
/// <returns>an indexSearcher on the test index.</returns>
public IndexSearcher SetUpSearcher(Analyzer analyzer)
{
    Directory dir = new RAMDirectory();

    var phrases = new[]
                      {
                          "please divide this sentence into shingles",
                          "just another test sentence",
                          "a sentence which contains no test"
                      };

    // The writer has to be disposed before the searcher is opened; otherwise the
    // added documents are not guaranteed to be visible and the writer is leaked.
    using (var writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        foreach (var phrase in phrases)
        {
            var doc = new Document();
            doc.Add(new Field("content", phrase,
                              Field.Store.YES, Field.Index.ANALYZED));
            // Without this call the document is built but never reaches the index.
            writer.AddDocument(doc);
        }
    }

    return new IndexSearcher(dir, true);
}
/// <summary>
/// Builds the test index with <paramref name="analyzer"/>, parses <paramref name="qs"/>
/// against the "content" field using a QueryParser configured with the same analyzer,
/// and runs the resulting query.
/// </summary>
/// <param name="analyzer">analyzer used for both indexing and query parsing</param>
/// <param name="qs">the query string to parse</param>
/// <returns>the score docs returned by the search (up to 1000 are requested)</returns>
protected ScoreDoc[] QueryParsingTest(Analyzer analyzer, String qs)
{
    // The searcher is kept in the public field, as the other tests in this class do.
    Searcher = SetUpSearcher(analyzer);

    var qp = new QueryParser(Util.Version.LUCENE_CURRENT, "content", analyzer);
    var q = qp.Parse(qs);

    return Searcher.Search(q, null, 1000).ScoreDocs;
}
/// <summary>
/// Asserts that <paramref name="hits"/> contains exactly the expected documents,
/// in the expected order.
/// </summary>
/// <param name="hits">the hits to check</param>
/// <param name="ranks">the expected document ids, in hit order</param>
protected void CompareRanks(ScoreDoc[] hits, int[] ranks)
{
    // Check the count first so that a mismatch fails with a clear message
    // instead of an index-out-of-range error in the loop below.
    Assert.AreEqual(ranks.Length, hits.Length);

    for (int i = 0; i < ranks.Length; i++)
    {
        Assert.AreEqual(ranks[i], hits[i].Doc);
    }
}
/// <summary>
/// Parses the unquoted query "test sentence" with a bigram ShingleAnalyzerWrapper and
/// expects all three documents back, in the order 1, 2, 0.
/// Will not work on an index without unigrams, since QueryParser automatically tokenizes on whitespace.
/// </summary>
[Test]
public void TestShingleAnalyzerWrapperQueryParsing()
{
    var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                "test sentence");
    var ranks = new[] {1, 2, 0};
    CompareRanks(hits, ranks);
}
/// <summary>
/// Parses the quoted phrase "this sentence" with a bigram ShingleAnalyzerWrapper and
/// expects document 0 as the only hit.
/// </summary>
/// <remarks>
/// NOTE(review): the previous comment on this test read "This one fails with an
/// exception." - confirm whether that still applies, since the body asserts a hit.
/// </remarks>
[Test]
public void TestShingleAnalyzerWrapperPhraseQueryParsingFails()
{
    var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                "\"this sentence\"");
    var ranks = new[] {0};
    CompareRanks(hits, ranks);
}
/// <summary>
/// Parses the quoted phrase "test sentence" with a bigram ShingleAnalyzerWrapper and
/// expects document 1 as the only hit.
/// This one works, actually.
/// </summary>
[Test]
public void TestShingleAnalyzerWrapperPhraseQueryParsing()
{
    var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                "\"test sentence\"");
    var ranks = new[] {1};
    CompareRanks(hits, ranks);
}
/// <summary>
/// Parses "+test +sentence" (both terms required) with a bigram ShingleAnalyzerWrapper
/// and expects documents 1 and 2, in that order.
/// Same as the unquoted case: the query is tokenized without using the analyzer.
/// </summary>
[Test]
public void TestShingleAnalyzerWrapperRequiredQueryParsing()
{
    var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                "+test +sentence");
    var ranks = new[] {1, 2};
    CompareRanks(hits, ranks);
}
/// <summary>
/// This shows how to construct a phrase query containing shingles.
/// </summary>
/// <remarks>
/// The text "this sentence" is run through the shingle analyzer and every emitted term
/// is added to a PhraseQuery at the position derived from its position increment.
/// Document 0 is expected to be the only hit.
/// </remarks>
[Test]
public void TestShingleAnalyzerWrapperPhraseQuery()
{
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    Searcher = SetUpSearcher(analyzer);

    var q = new PhraseQuery();

    var ts = analyzer.TokenStream("content", new StringReader("this sentence"));
    var posIncrAtt = ts.AddAttribute<IPositionIncrementAttribute>();
    var termAtt = ts.AddAttribute<ITermAttribute>();

    // Starts at -1 so that adding the first token's increment gives its position.
    var j = -1;
    while (ts.IncrementToken())
    {
        // All three statements belong to the loop: each token advances the position
        // and contributes one term to the phrase.
        j += posIncrAtt.PositionIncrement;
        var termText = termAtt.Term;
        q.Add(new Term("content", termText), j);
    }

    var hits = Searcher.Search(q, null, 1000).ScoreDocs;
    var ranks = new[] {0};
    CompareRanks(hits, ranks);
}
/// <summary>
/// How to construct a boolean query with shingles. A query like this will
/// implicitly score those documents higher that contain the words in the query
/// in the right order and adjacent to each other.
/// </summary>
/// <remarks>
/// Every term the shingle analyzer emits for "test sentence" becomes an optional
/// TermQuery clause. All three documents are expected, in the order 1, 2, 0.
/// </remarks>
[Test]
public void TestShingleAnalyzerWrapperBooleanQuery()
{
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    Searcher = SetUpSearcher(analyzer);

    var q = new BooleanQuery();

    var ts = analyzer.TokenStream("content", new StringReader("test sentence"));
    var termAtt = ts.AddAttribute<ITermAttribute>();

    while (ts.IncrementToken())
    {
        var termText = termAtt.Term;
        // NOTE(review): the occurrence argument was missing from this call. SHOULD is
        // used because the expected result includes document 0, which contains
        // "sentence" but not "test", so the clauses cannot be required - confirm.
        q.Add(new TermQuery(new Term("content", termText)),
              Occur.SHOULD);
    }

    var hits = Searcher.Search(q, null, 1000).ScoreDocs;
    var ranks = new[] {1, 2, 0};
    CompareRanks(hits, ranks);
}
/// <summary>
/// Analyzes two different inputs with the same bigram ShingleAnalyzerWrapper instance
/// via AssertAnalyzesToReuse and checks terms, start offsets, end offsets and
/// position increments for each.
/// </summary>
[Test]
public void TestReusableTokenStream()
{
    Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);

    // Seven tokens are expected (four unigrams interleaved with three bigrams), which
    // matches the seven entries in each of the offset / increment arrays.
    AssertAnalyzesToReuse(a, "please divide into shingles",
                          new[]
                              {
                                  "please", "please divide", "divide", "divide into", "into", "into shingles",
                                  "shingles"
                              },
                          new[] {0, 0, 7, 7, 14, 14, 19},
                          new[] {6, 13, 13, 18, 18, 27, 27},
                          new[] {1, 0, 1, 0, 1, 0, 1});

    AssertAnalyzesToReuse(a, "divide me up again",
                          new[] {"divide", "divide me", "me", "me up", "up", "up again", "again"},
                          new[] {0, 0, 7, 7, 10, 10, 13},
                          new[] {6, 9, 9, 12, 12, 18, 18},
                          new[] {1, 0, 1, 0, 1, 0, 1});
}
/// <summary>
/// Checks that a ShingleAnalyzerWrapper subclass which overrides TokenStream to return
/// a plain WhitespaceTokenizer produces only the whitespace-separated words (no
/// shingles) when analyzed through AssertAnalyzesToReuse.
/// </summary>
/// <remarks>
/// NOTE(review): the name suggests this is the backwards-compatibility check for
/// issue LUCENE-1678 - confirm against the issue tracker.
/// </remarks>
[Test]
public void TestLucene1678BwComp()
{
    Analyzer a = new ShingleWrapperSubclassAnalyzer();
    AssertAnalyzesToReuse(a, "this is a test",
                          new[] { "this", "is", "a", "test" },
                          new[] { 0, 5, 8, 10 },
                          new[] { 4, 7, 9, 14 });
}
#region Nested type: NonreusableAnalyzer

/// <summary>
/// Analyzer that does not support reuse: it is a LetterTokenizer on odd invocations
/// of TokenStream and a WhitespaceTokenizer on even invocations.
/// </summary>
private class NonreusableAnalyzer : Analyzer
{
    // Number of TokenStream calls made so far; decides which tokenizer is returned.
    private int _invocationCount;

    /// <summary>
    /// Returns a new tokenizer over <paramref name="reader"/>, alternating the
    /// tokenizer type on every call.
    /// </summary>
    /// <param name="fieldName">name of the field being analyzed (not used)</param>
    /// <param name="reader">the text to tokenize</param>
    /// <returns>a WhitespaceTokenizer on even calls, a LetterTokenizer on odd calls</returns>
    public override TokenStream TokenStream(String fieldName, TextReader reader)
    {
        if (++_invocationCount%2 == 0)
        {
            return new WhitespaceTokenizer(reader);
        }

        return new LetterTokenizer(reader);
    }
}

#endregion
#region Nested type: ShingleWrapperSubclassAnalyzer

/// <summary>
/// subclass that acts just like whitespace analyzer for testing
/// </summary>
private class ShingleWrapperSubclassAnalyzer : ShingleAnalyzerWrapper
{
    /// <summary>
    /// Creates the analyzer through the base constructor that takes only a version.
    /// </summary>
    public ShingleWrapperSubclassAnalyzer()
        : base(Util.Version.LUCENE_CURRENT)
    {
    }

    /// <summary>
    /// Ignores the shingle wrapping of the base class and tokenizes on whitespace only.
    /// </summary>
    /// <param name="fieldName">name of the field being analyzed (not used)</param>
    /// <param name="reader">the text to tokenize</param>
    /// <returns>a new WhitespaceTokenizer over <paramref name="reader"/></returns>
    public override TokenStream TokenStream(String fieldName, TextReader reader)
    {
        return new WhitespaceTokenizer(reader);
    }
}

#endregion
/// <summary>
/// Wraps an analyzer that does not support reuse (it is a LetterTokenizer on odd
/// invocations, a WhitespaceTokenizer on even ones) and analyzes the same input three
/// times, checking that the output follows whichever tokenizer is current.
/// </summary>
/// <remarks>
/// The expected last token alternates between "shingles" (end offset 27, trailing
/// period dropped) and "shingles." (end offset 28, period kept).
/// </remarks>
[Test]
public void TestWrappedAnalyzerDoesNotReuse()
{
    Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());

    // 1st pass: the period is not part of the last token.
    AssertAnalyzesToReuse(a, "please divide into shingles.",
                          new[]
                              {
                                  "please", "please divide", "divide", "divide into", "into", "into shingles",
                                  "shingles"
                              },
                          new[] { 0, 0, 7, 7, 14, 14, 19 },
                          new[] { 6, 13, 13, 18, 18, 27, 27 },
                          new[] { 1, 0, 1, 0, 1, 0, 1 });

    // 2nd pass: the period stays attached to the last token.
    AssertAnalyzesToReuse(a, "please divide into shingles.",
                          new[]
                              {
                                  "please", "please divide", "divide", "divide into", "into", "into shingles.",
                                  "shingles."
                              },
                          new[] { 0, 0, 7, 7, 14, 14, 19 },
                          new[] { 6, 13, 13, 18, 18, 28, 28 },
                          new[] { 1, 0, 1, 0, 1, 0, 1 });

    // 3rd pass: same expectations as the 1st.
    AssertAnalyzesToReuse(a, "please divide into shingles.",
                          new[]
                              {
                                  "please", "please divide", "divide", "divide into", "into", "into shingles",
                                  "shingles"
                              },
                          new[] { 0, 0, 7, 7, 14, 14, 19 },
                          new[] { 6, 13, 13, 18, 18, 27, 27 },
                          new[] { 1, 0, 1, 0, 1, 0, 1 });
}