474 lines
15 KiB
474 lines
15 KiB
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Search;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Store;
using Lucene.Net.Index;
using Lucene.Net.Util;
using NUnit.Framework;
namespace Lucene.Net.Search.Vectorhighlight
public abstract class AbstractTestCase
// Default field name used by the query helpers and index builders of this class.
protected String F = "f";
// Additional field names; not referenced in the visible part of this file.
protected String F1 = "f1";
protected String F2 = "f2";
// Index storage; SetUp() assigns a RAMDirectory.
protected Directory dir;
// SetUp() assigns a WhitespaceAnalyzer.
protected Analyzer analyzerW;
// SetUp() assigns a BigramAnalyzer.
protected Analyzer analyzerB;
// SetUp() assigns a KeywordAnalyzer.
protected Analyzer analyzerK;
// Reader over the index most recently built by one of the Make1dmfIndex* helpers.
protected IndexReader reader;
// Query parsers over field F, created in SetUp() with analyzerW and analyzerB respectively.
protected QueryParser paW;
protected QueryParser paB;
// Values for a short multi-valued field; the middle value is deliberately empty.
protected static String[] shortMVValues = {
    "a b c",
    "", // empty data in multi valued field
    "d e"
};

// Values for a long multi-valued field (two sentences).
protected static String[] longMVValues = {
    "Followings are the examples of customizable parameters and actual examples of customization:",
    "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
};

// test data for LUCENE-1448 bug
// Each value starts with a line feed, which is one of the tokenizer's default delimiters.
protected static String[] biMVValues = {
    "\nLucene/Solr does not require such additional hardware.",
    "\nWhen you talk about processing speed, the"
};

// Values for the multi-valued field whose layout is documented in MakeIndexStrMV().
protected static String[] strMVValues = {
    "abc",
    "defg",
    "hijkl"
};
/// <summary>
/// Creates the three analyzers, the two query parsers over field F and a fresh
/// in-memory directory.
/// </summary>
// NOTE(review): no NUnit [SetUp] attribute is visible on this method - confirm how it is invoked.
public void SetUp()
{
    // The directory does not depend on the analyzers, so it is created first.
    dir = new RAMDirectory();

    analyzerW = new WhitespaceAnalyzer();
    analyzerB = new BigramAnalyzer();
    analyzerK = new KeywordAnalyzer();

    // Both parsers share the same compatibility version and default field.
    Util.Version matchVersion = Util.Version.LUCENE_CURRENT;
    paW = new QueryParser(matchVersion, F, analyzerW);
    paB = new QueryParser(matchVersion, F, analyzerB);
}
/// <summary>
/// Releases the index reader opened by the index-building helpers, if any.
/// </summary>
// NOTE(review): no NUnit [TearDown] attribute is visible on this method - confirm how it is invoked.
public void TearDown()
{
    if (reader != null)
    {
        // Close the reader before dropping the reference; otherwise the
        // underlying index resources stay open until finalization.
        reader.Dispose();
        reader = null;
    }
}
/// <summary>Builds a term query for <paramref name="text"/> on field F with boost 1.</summary>
protected Query Tq(String text)
{
    return Tq(1.0f, text);
}

/// <summary>Builds a boosted term query for <paramref name="text"/> on field F.</summary>
protected Query Tq(float boost, String text)
{
    return Tq(boost, F, text);
}

/// <summary>Builds a term query for <paramref name="text"/> on <paramref name="field"/> with boost 1.</summary>
protected Query Tq(String field, String text)
{
    return Tq(1.0f, field, text);
}

/// <summary>Builds a term query for the given field and text and applies the boost.</summary>
/// <param name="boost">Boost assigned to the resulting query.</param>
/// <param name="field">Field of the term.</param>
/// <param name="text">Text of the term.</param>
/// <returns>The configured <c>TermQuery</c>.</returns>
protected Query Tq(float boost, String field, String text)
{
    Term term = new Term(field, text);
    Query termQuery = new TermQuery(term);
    termQuery.Boost = boost;
    return termQuery;
}
/// <summary>Builds a prefix query for <paramref name="text"/> on field F with boost 1.</summary>
protected Query Preq(String text)
{
    return Preq(1.0f, text);
}

/// <summary>Builds a boosted prefix query for <paramref name="text"/> on field F.</summary>
protected Query Preq(float boost, String text)
{
    return Preq(boost, F, text);
}

/// <summary>Builds a prefix query for <paramref name="text"/> on <paramref name="field"/> with boost 1.</summary>
protected Query Preq(String field, String text)
{
    return Preq(1.0f, field, text);
}

/// <summary>Builds a prefix query for the given field and prefix text and applies the boost.</summary>
/// <param name="boost">Boost assigned to the resulting query.</param>
/// <param name="field">Field of the prefix term.</param>
/// <param name="text">Prefix text.</param>
/// <returns>The configured <c>PrefixQuery</c>.</returns>
protected Query Preq(float boost, String field, String text)
{
    Term prefix = new Term(field, text);
    Query prefixQuery = new PrefixQuery(prefix);
    prefixQuery.Boost = boost;
    return prefixQuery;
}
/// <summary>Builds a phrase query on field F from <paramref name="texts"/> with boost 1 and slop 0.</summary>
// A separate String[] overload is not needed: a params parameter also accepts an explicit array.
protected Query PqF(params String[] texts)
{
    return PqF(1.0f, texts);
}

/// <summary>Builds a boosted phrase query on field F from <paramref name="texts"/> with slop 0.</summary>
protected Query PqF(float boost, params String[] texts)
{
    const int noSlop = 0;
    return pqF(boost, noSlop, texts);
}

/// <summary>Builds a phrase query on field F with the given boost and slop.</summary>
/// <param name="boost">Boost assigned to the resulting query.</param>
/// <param name="slop">Slop assigned to the resulting query.</param>
/// <param name="texts">Terms of the phrase, in order.</param>
protected Query pqF(float boost, int slop, params String[] texts)
{
    return Pq(boost, slop, F, texts);
}
/// <summary>Builds a phrase query on <paramref name="field"/> with boost 1 and slop 0.</summary>
protected Query Pq(String field, params String[] texts)
{
    return Pq(1.0f, 0, field, texts);
}

/// <summary>Builds a boosted phrase query on <paramref name="field"/> with slop 0.</summary>
protected Query Pq(float boost, String field, params String[] texts)
{
    return Pq(boost, 0, field, texts);
}

/// <summary>Builds a phrase query from the given terms and applies boost and slop.</summary>
/// <param name="boost">Boost assigned to the resulting query.</param>
/// <param name="slop">Slop assigned to the resulting query.</param>
/// <param name="field">Field shared by all terms of the phrase.</param>
/// <param name="texts">Term texts, added to the phrase in array order.</param>
/// <returns>The configured <c>PhraseQuery</c>.</returns>
protected Query Pq(float boost, int slop, String field, params String[] texts)
{
    PhraseQuery phrase = new PhraseQuery();
    for (int i = 0; i < texts.Length; i++)
    {
        phrase.Add(new Term(field, texts[i]));
    }
    phrase.Boost = boost;
    phrase.Slop = slop;
    return phrase;
}
/// <summary>Builds a disjunction-max query over <paramref name="queries"/> with a tie breaker of 0.</summary>
protected Query Dmq(params Query[] queries)
{
    return Dmq(0.0F, queries);
}

/// <summary>Builds a disjunction-max query over <paramref name="queries"/>.</summary>
/// <param name="tieBreakerMultiplier">Tie breaker passed to the <c>DisjunctionMaxQuery</c> constructor.</param>
/// <param name="queries">Sub-queries added as disjuncts, in array order.</param>
/// <returns>The query containing every element of <paramref name="queries"/>.</returns>
protected Query Dmq(float tieBreakerMultiplier, params Query[] queries)
{
    DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier);
    foreach (Query q in queries)
    {
        // Each sub-query must be registered as a disjunct; without this the
        // returned query would be empty.
        query.Add(q);
    }
    return query;
}
/// <summary>
/// Asserts that <paramref name="actual"/> holds exactly as many entries as
/// <paramref name="expected"/> and that every expected query is one of its keys.
/// </summary>
/// <param name="actual">Collected queries, keyed by query.</param>
/// <param name="expected">Queries that must all be present.</param>
protected void AssertCollectionQueries(Dictionary<Query, Query> actual, params Query[] expected)
{
    Assert.AreEqual(expected.Length, actual.Count);
    foreach (Query query in expected)
    {
        // The count check alone would accept a collection of the right size
        // holding the wrong queries, so membership is verified per query.
        Assert.IsTrue(actual.ContainsKey(query));
    }
}
/// <summary>
/// Analyzer that tokenizes every field with a <c>BasicNGramTokenizer</c> created
/// with its default settings.
/// </summary>
class BigramAnalyzer : Analyzer
{
    /// <summary>Returns a new n-gram tokenizer over <paramref name="reader"/>; the field name is not used.</summary>
    public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
    {
        Tokenizer ngrams = new BasicNGramTokenizer(reader);
        return ngrams;
    }
}
/// <summary>
/// Tokenizer that splits the input at delimiter characters into snippets and emits
/// every window of <c>n</c> consecutive characters of each snippet as a token.
/// A snippet shorter than <c>n</c> is emitted once as a whole.
/// </summary>
class BasicNGramTokenizer : Tokenizer
{
    public static int DEFAULT_N_SIZE = 2;
    public static String DEFAULT_DELIMITERS = " \t\n.,";

    private static readonly int BUFFER_SIZE = 4096;

    private int n;                       // n-gram size
    private String delimiters;           // characters that separate snippets
    private int startTerm;               // index of the current n-gram inside snippet
    private int lenTerm;                 // length of the current n-gram
    private int startOffset;             // input offset of the current n-gram
    private int nextStartOffset;         // number of characters consumed from the input so far
    private int ch;                      // last character read, or -1 once the input is exhausted
    private String snippet;              // current delimiter-free run of characters
    private StringBuilder snippetBuffer; // accumulates the next snippet
    private char[] charBuffer;           // read buffer over the input reader
    private int charBufferIndex;
    private int charBufferLen;

    ITermAttribute termAtt = null;
    IOffsetAttribute offsetAtt = null;

    /// <summary>Creates a tokenizer with the default n-gram size and delimiters.</summary>
    public BasicNGramTokenizer(System.IO.TextReader inReader) : this(inReader, DEFAULT_N_SIZE)
    {
    }

    /// <summary>Creates a tokenizer with the given n-gram size and the default delimiters.</summary>
    public BasicNGramTokenizer(System.IO.TextReader inReader, int n) : this(inReader, n, DEFAULT_DELIMITERS)
    {
    }

    /// <summary>Creates a tokenizer with the default n-gram size and the given delimiters.</summary>
    public BasicNGramTokenizer(System.IO.TextReader inReader, String delimiters) : this(inReader, DEFAULT_N_SIZE, delimiters)
    {
    }

    /// <summary>Creates a tokenizer with the given n-gram size and delimiters.</summary>
    public BasicNGramTokenizer(System.IO.TextReader inReader, int n, String delimiters) : base(inReader)
    {
        this.n = n;
        this.delimiters = delimiters;
        startTerm = 0;
        nextStartOffset = 0;
        snippet = null;
        snippetBuffer = new StringBuilder();
        charBuffer = new char[BUFFER_SIZE];
        // Index == buffer size forces a refill on the first read.
        charBufferIndex = BUFFER_SIZE;
        charBufferLen = 0;
        ch = 0;

        // The attributes must be registered before the first IncrementToken()
        // call; otherwise termAtt/offsetAtt are still null there.
        Init();
    }

    /// <summary>Registers the term and offset attributes on this stream.</summary>
    void Init()
    {
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>Advances to the next n-gram and publishes its text and offsets.</summary>
    /// <returns><c>false</c> once the input is exhausted, otherwise <c>true</c>.</returns>
    public override bool IncrementToken()
    {
        if (!GetNextPartialSnippet())
        {
            return false;
        }
        // Reset all attributes before populating the new token, as the
        // TokenStream contract expects of token producers.
        ClearAttributes();
        termAtt.SetTermBuffer(snippet, startTerm, lenTerm);
        offsetAtt.SetOffset(CorrectOffset(startOffset), CorrectOffset(startOffset + lenTerm));
        return true;
    }

    /// <summary>Returns the number of characters consumed from the input.</summary>
    private int GetFinalOffset()
    {
        return nextStartOffset;
    }

    /// <summary>Sets both offsets to the final offset of the input.</summary>
    public override void End()
    {
        int finalOffset = GetFinalOffset();
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Moves the n-gram window one character to the right inside the current snippet,
    /// or loads the next snippet when the current one has no further full window.
    /// </summary>
    protected bool GetNextPartialSnippet()
    {
        if (snippet != null && snippet.Length >= startTerm + 1 + n)
        {
            // The window must actually advance; otherwise the same n-gram
            // would be returned forever.
            startTerm++;
            startOffset++;
            lenTerm = n;
            return true;
        }
        return GetNextSnippet();
    }

    /// <summary>
    /// Reads the next run of non-delimiter characters into <c>snippet</c> and positions
    /// the window at its start.
    /// </summary>
    /// <returns><c>false</c> when no further non-delimiter character is available.</returns>
    protected bool GetNextSnippet()
    {
        startTerm = 0;
        startOffset = nextStartOffset;
        snippetBuffer.Remove(0, snippetBuffer.Length);
        while (true)
        {
            // Once the end of input has been seen, do not read again.
            if (ch != -1)
            {
                ch = ReadCharFromBuffer();
            }

            if (ch == -1)
            {
                break;
            }
            else if (!IsDelimiter(ch))
            {
                snippetBuffer.Append((char)ch);
            }
            else if (snippetBuffer.Length > 0)
            {
                // A delimiter after at least one character ends the snippet.
                break;
            }
            else
            {
                // Leading delimiter: the snippet starts one character later.
                startOffset++;
            }
        }
        if (snippetBuffer.Length == 0)
        {
            return false;
        }
        snippet = snippetBuffer.ToString();
        lenTerm = snippet.Length >= n ? n : snippet.Length;
        return true;
    }

    /// <summary>
    /// Returns the next input character, refilling the read buffer when needed.
    /// </summary>
    /// <returns>The character code, or -1 when the reader yields no more data.</returns>
    protected int ReadCharFromBuffer()
    {
        if (charBufferIndex >= charBufferLen)
        {
            charBufferLen = input.Read(charBuffer, 0, charBuffer.Length);
            if (charBufferLen <= 0)
            {
                return -1;
            }
            charBufferIndex = 0;
        }
        int c = (int)charBuffer[charBufferIndex++];
        // Track the consumed character count; token offsets are derived from it.
        nextStartOffset++;
        return c;
    }

    /// <summary>Tells whether <paramref name="c"/> is one of the configured delimiter characters.</summary>
    protected bool IsDelimiter(int c)
    {
        return delimiters.IndexOf(Convert.ToChar(c)) >= 0;
    }
}
/// <summary>Builds a one-document index whose field holds the single <paramref name="value"/>.</summary>
protected void Make1d1fIndex(String value)
{
    String[] singleValue = new String[] { value };
    Make1dmfIndex(singleValue);
}
/// <summary>
/// Builds a one-document index whose field holds the single <paramref name="value"/>,
/// via the Make1dmfIndexB helper.
/// </summary>
protected void Make1d1fIndexB(String value)
{
    String[] singleValue = new String[] { value };
    Make1dmfIndexB(singleValue);
}
/// <summary>Builds a one-document, multi-valued index using <c>analyzerW</c>.</summary>
protected void Make1dmfIndex(params String[] values)
{
    Analyzer whitespace = analyzerW;
    Make1dmfIndex(whitespace, values);
}
/// <summary>Builds a one-document, multi-valued index using <c>analyzerB</c>.</summary>
protected void Make1dmfIndexB(params String[] values)
{
    Analyzer bigram = analyzerB;
    Make1dmfIndex(bigram, values);
}
// make 1 doc with multi valued field
/// <summary>
/// Recreates the index in <c>dir</c> with a single document whose field F holds every
/// element of <paramref name="values"/> (stored, analyzed, with term vector positions
/// and offsets), then opens a read-only reader on it.
/// </summary>
/// <param name="analyzer">Analyzer used by the index writer.</param>
/// <param name="values">Field values, added in array order.</param>
protected void Make1dmfIndex(Analyzer analyzer, params String[] values)
{
    // The writer is disposed before the reader is opened so that the document
    // is committed and visible to the reader.
    using (IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
    {
        Document doc = new Document();
        foreach (String value in values)
        {
            doc.Add(new Field(F, value, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
        }
        // Without this call the populated document never reaches the index.
        writer.AddDocument(doc);
    }
    reader = IndexReader.Open(dir, true);
}
// make 1 doc with multi valued & not analyzed field
/// <summary>
/// Recreates the index in <c>dir</c> with a single document whose field F holds every
/// element of <paramref name="values"/> as a stored, not-analyzed value with term vector
/// positions and offsets, then opens a read-only reader on it.
/// </summary>
/// <param name="values">Field values, added in array order.</param>
protected void Make1dmfIndexNA(String[] values)
{
    // The writer is disposed before the reader is opened so that the document
    // is committed and visible to the reader.
    using (IndexWriter writer = new IndexWriter(dir, analyzerK, true, IndexWriter.MaxFieldLength.LIMITED))
    {
        Document doc = new Document();
        foreach (String value in values)
        {
            doc.Add(new Field(F, value, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
        }
        // Without this call the populated document never reaches the index.
        writer.AddDocument(doc);
    }
    reader = IndexReader.Open(dir, true);
}
/// <summary>Builds the index for <c>shortMVValues</c> with the whitespace analyzer.</summary>
protected void MakeIndexShortMV()
{
    // Layout of the indexed values: character offsets above each value,
    // term positions below it.
    //
    //  012345
    // "a b c"
    //  0 1 2
    //
    // ""
    //
    //  6789
    // "d e"
    //  3 4
    Make1dmfIndex(shortMVValues);
}
/// <summary>Builds the index for <c>longMVValues</c> with the whitespace analyzer.</summary>
protected void MakeIndexLongMV()
{
    // Layout of the indexed values: offset digit rows above each value, term
    // positions below it.
    // NOTE(review): the column alignment of this diagram appears to have been
    // lost - re-align it before relying on it.
    //
    // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999
    // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
    // Followings are the examples of customizable parameters and actual examples of customization:
    // 0 1 2 3 4 5 6 7 8 9 10 11
    // 1 2
    // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
    // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
    // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
    // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34
    Make1dmfIndex(longMVValues);
}
/// <summary>Builds the index for <c>biMVValues</c> with the bigram analyzer.</summary>
protected void MakeIndexLongMVB()
{
    // Layout of the indexed values: offset digit rows above each value, then
    // each bigram with its term position.
    // NOTE(review): the column alignment of this diagram appears to have been
    // lost - re-align it before relying on it.
    //
    // "*" [] LF
    // 1111111111222222222233333333334444444444555555
    // 01234567890123456789012345678901234567890123456789012345
    // *Lucene/Solr does not require such additional hardware.
    // Lu 0 do 10 re 15 su 21 na 31
    // uc 1 oe 11 eq 16 uc 22 al 32
    // ce 2 es 12 qu 17 ch 23 ha 33
    // en 3 no 13 ui 18 ad 24 ar 34
    // ne 4 ot 14 ir 19 dd 25 rd 35
    // e/ 5 re 20 di 26 dw 36
    // /S 6 it 27 wa 37
    // So 7 ti 28 ar 38
    // ol 8 io 29 re 39
    // lr 9 on 30
    // 5555666666666677777777778888888888999999999
    // 6789012345678901234567890123456789012345678
    // *When you talk about processing speed, the
    // Wh 40 ab 48 es 56 th 65
    // he 41 bo 49 ss 57 he 66
    // en 42 ou 50 si 58
    // yo 43 ut 51 in 59
    // ou 44 pr 52 ng 60
    // ta 45 ro 53 sp 61
    // al 46 oc 54 pe 62
    // lk 47 ce 55 ee 63
    // ed 64
    Make1dmfIndexB(biMVValues);
}
/// <summary>Builds the index for <c>strMVValues</c> as not-analyzed field values.</summary>
protected void MakeIndexStrMV()
{
    // Layout of the indexed values: character offsets above each value.
    //
    //  0123
    // "abc"
    //
    //  34567
    // "defg"
    //
    //     111
    //  789012
    // "hijkl"
    Make1dmfIndexNA(strMVValues);
}