/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.Text;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Search;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Store;
using Lucene.Net.Index;
using Lucene.Net.Util;

using NUnit.Framework;

namespace Lucene.Net.Search.Vectorhighlight
{
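    // Shared fixture for the vector highlighter tests: wires up analyzers, query
    // parsers and a RAMDirectory, and provides helpers for building queries and
    // small single-document indexes with term vectors.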
    public abstract class AbstractTestCase
    {
        protected String F = "f";
        protected String F1 = "f1";
        protected String F2 = "f2";
        protected Directory dir;
        protected Analyzer analyzerW;
        protected Analyzer analyzerB;
        protected Analyzer analyzerK;
        protected IndexReader reader;
        protected QueryParser paW;
        protected QueryParser paB;

        protected static String[] shortMVValues = {
            "a b c",
            "", // empty data in multi valued field
            "d e"
        };

        protected static String[] longMVValues = {
            "Followings are the examples of customizable parameters and actual examples of customization:",
            "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
        };

        // test data for LUCENE-1448 bug
        protected static String[] biMVValues = {
            "\nLucene/Solr does not require such additional hardware.",
            "\nWhen you talk about processing speed, the"
        };

        protected static String[] strMVValues = {
            "abc",
            "defg",
            "hijkl"
        };

        [SetUp]
        public void SetUp()
        {
            analyzerW = new WhitespaceAnalyzer();
            analyzerB = new BigramAnalyzer();
            analyzerK = new KeywordAnalyzer();
            paW = new QueryParser(Util.Version.LUCENE_CURRENT, F, analyzerW);
            paB = new QueryParser(Util.Version.LUCENE_CURRENT, F, analyzerB);
            dir = new RAMDirectory();
        }

        [TearDown]
        public void TearDown()
        {
            if (reader != null)
            {
                reader.Close();
                reader = null;
            }
        }

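        // Query construction helpers: Tq builds a TermQuery, Preq a PrefixQuery,
        // Pq/PqF a PhraseQuery (optionally with slop) and Dmq a DisjunctionMaxQuery.
        // Overloads without a field argument default to field F.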
        protected Query Tq(String text)
        {
            return Tq(1F, text);
        }

        protected Query Tq(float boost, String text)
        {
            return Tq(boost, F, text);
        }

        protected Query Tq(String field, String text)
        {
            return Tq(1F, field, text);
        }

        protected Query Tq(float boost, String field, String text)
        {
            Query query = new TermQuery(new Term(field, text));
            query.Boost = boost;
            return query;
        }

        protected Query Preq(String text)
        {
            return Preq(1F, text);
        }

        protected Query Preq(float boost, String text)
        {
            return Preq(boost, F, text);
        }

        protected Query Preq(String field, String text)
        {
            return Preq(1F, field, text);
        }

        protected Query Preq(float boost, String field, String text)
        {
            Query query = new PrefixQuery(new Term(field, text));
            query.Boost = boost;
            return query;
        }

        protected Query PqF(params String[] texts)
        {
            return PqF(1F, texts);
        }

        //protected Query pqF(String[] texts)
        //{
        //    return pqF(1F, texts);
        //}

        protected Query PqF(float boost, params String[] texts)
        {
            return pqF(boost, 0, texts);
        }

        protected Query pqF(float boost, int slop, params String[] texts)
        {
            return Pq(boost, slop, F, texts);
        }

        protected Query Pq(String field, params String[] texts)
        {
            return Pq(1F, 0, field, texts);
        }

        protected Query Pq(float boost, String field, params String[] texts)
        {
            return Pq(boost, 0, field, texts);
        }

        protected Query Pq(float boost, int slop, String field, params String[] texts)
        {
            PhraseQuery query = new PhraseQuery();
            foreach (String text in texts)
            {
                query.Add(new Term(field, text));
            }
            query.Boost = boost;
            query.Slop = slop;
            return query;
        }

        protected Query Dmq(params Query[] queries)
        {
            return Dmq(0.0F, queries);
        }

        protected Query Dmq(float tieBreakerMultiplier, params Query[] queries)
        {
            DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier);
            foreach (Query q in queries)
            {
                query.Add(q);
            }
            return query;
        }

        protected void AssertCollectionQueries(Dictionary<Query, Query> actual, params Query[] expected)
        {
            Assert.AreEqual(expected.Length, actual.Count);
            foreach (Query query in expected)
            {
                Assert.IsTrue(actual.ContainsKey(query));
            }
        }

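        // Analyzer that tokenizes its input into character bigrams via BasicNGramTokenizer.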
        class BigramAnalyzer : Analyzer
        {
            public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
            {
                return new BasicNGramTokenizer(reader);
            }
        }

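        // Minimal character n-gram tokenizer (default n = 2): it splits the input on
        // the configured delimiter characters, emits overlapping n-grams from each run,
        // and tracks start/end offsets so offset-sensitive highlighting can be tested.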
        class BasicNGramTokenizer : Tokenizer
        {
            public static int DEFAULT_N_SIZE = 2;
            public static String DEFAULT_DELIMITERS = " \t\n.,";
            private int n;
            private String delimiters;
            private int startTerm;
            private int lenTerm;
            private int startOffset;
            private int nextStartOffset;
            private int ch;
            private String snippet;
            private StringBuilder snippetBuffer;
            private static int BUFFER_SIZE = 4096;
            private char[] charBuffer;
            private int charBufferIndex;
            private int charBufferLen;

            public BasicNGramTokenizer(System.IO.TextReader inReader) : this(inReader, DEFAULT_N_SIZE)
            {
            }

            public BasicNGramTokenizer(System.IO.TextReader inReader, int n) : this(inReader, n, DEFAULT_DELIMITERS)
            {
            }

            public BasicNGramTokenizer(System.IO.TextReader inReader, String delimiters) : this(inReader, DEFAULT_N_SIZE, delimiters)
            {
            }

            public BasicNGramTokenizer(System.IO.TextReader inReader, int n, String delimiters) : base(inReader)
            {
                this.n = n;
                this.delimiters = delimiters;
                startTerm = 0;
                nextStartOffset = 0;
                snippet = null;
                snippetBuffer = new StringBuilder();
                charBuffer = new char[BUFFER_SIZE];
                charBufferIndex = BUFFER_SIZE;
                charBufferLen = 0;
                ch = 0;

                Init();
            }

            void Init()
            {
                termAtt = AddAttribute<ITermAttribute>();
                offsetAtt = AddAttribute<IOffsetAttribute>();
            }

            ITermAttribute termAtt = null;
            IOffsetAttribute offsetAtt = null;

            public override bool IncrementToken()
            {
                if (!GetNextPartialSnippet())
                    return false;
                ClearAttributes();
                termAtt.SetTermBuffer(snippet, startTerm, lenTerm);
                offsetAtt.SetOffset(CorrectOffset(startOffset), CorrectOffset(startOffset + lenTerm));
                return true;
            }

            private int GetFinalOffset()
            {
                return nextStartOffset;
            }

            public override void End()
            {
                offsetAtt.SetOffset(GetFinalOffset(), GetFinalOffset());
            }

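            // GetNextPartialSnippet slides the n-gram window one character forward within
            // the current snippet; once the snippet is exhausted it falls back to
            // GetNextSnippet, which reads the next delimiter-free run of characters.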
            protected bool GetNextPartialSnippet()
            {
                if (snippet != null && snippet.Length >= startTerm + 1 + n)
                {
                    startTerm++;
                    startOffset++;
                    lenTerm = n;
                    return true;
                }
                return GetNextSnippet();
            }

            protected bool GetNextSnippet()
            {
                startTerm = 0;
                startOffset = nextStartOffset;
                snippetBuffer.Remove(0, snippetBuffer.Length);
                while (true)
                {
                    if (ch != -1)
                        ch = ReadCharFromBuffer();
                    if (ch == -1) break;
                    else if (!IsDelimiter(ch))
                        snippetBuffer.Append((char)ch);
                    else if (snippetBuffer.Length > 0)
                        break;
                    else
                        startOffset++;
                }
                if (snippetBuffer.Length == 0)
                    return false;
                snippet = snippetBuffer.ToString();
                lenTerm = snippet.Length >= n ? n : snippet.Length;
                return true;
            }

            protected int ReadCharFromBuffer()
            {
                if (charBufferIndex >= charBufferLen)
                {
                    charBufferLen = input.Read(charBuffer, 0, charBuffer.Length);
                    if (charBufferLen <= 0)
                    {
                        return -1;
                    }
                    charBufferIndex = 0;
                }
                int c = (int)charBuffer[charBufferIndex++];
                nextStartOffset++;
                return c;
            }

            protected bool IsDelimiter(int c)
            {
                return delimiters.IndexOf(Convert.ToChar(c)) >= 0;
            }
        }

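        // Index-building helpers: each writes a single document (with term vectors,
        // positions and offsets) into the RAMDirectory and reopens the shared reader.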
        protected void Make1d1fIndex(String value)
        {
            Make1dmfIndex(value);
        }

        protected void Make1d1fIndexB(String value)
        {
            Make1dmfIndexB(value);
        }

        protected void Make1dmfIndex(params String[] values)
        {
            Make1dmfIndex(analyzerW, values);
        }

        protected void Make1dmfIndexB(params String[] values)
        {
            Make1dmfIndex(analyzerB, values);
        }

        // make 1 doc with multi valued field
        protected void Make1dmfIndex(Analyzer analyzer, params String[] values)
        {
            IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            Document doc = new Document();
            foreach (String value in values)
                doc.Add(new Field(F, value, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            writer.AddDocument(doc);
            writer.Close();

            reader = IndexReader.Open(dir, true);
        }

        // make 1 doc with multi valued & not analyzed field
        protected void Make1dmfIndexNA(String[] values)
        {
            IndexWriter writer = new IndexWriter(dir, analyzerK, true, IndexWriter.MaxFieldLength.LIMITED);
            Document doc = new Document();
            foreach (String value in values)
                doc.Add(new Field(F, value, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            writer.AddDocument(doc);
            writer.Close();

            reader = IndexReader.Open(dir, true);
        }

        protected void MakeIndexShortMV()
        {
            // 012345
            // "a b c"
            // 0 1 2

            // ""

            // 6789
            // "d e"
            // 3 4

            Make1dmfIndex(shortMVValues);
        }

        protected void MakeIndexLongMV()
        {
            // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999
            // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
            // Followings are the examples of customizable parameters and actual examples of customization:
            // 0 1 2 3 4 5 6 7 8 9 10 11

            // 1 2
            // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
            // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
            // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
            // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34

            Make1dmfIndex(longMVValues);
        }

        protected void MakeIndexLongMVB()
        {
            // "*" [] LF

            // 1111111111222222222233333333334444444444555555
            // 01234567890123456789012345678901234567890123456789012345
            // *Lucene/Solr does not require such additional hardware.
            // Lu 0 do 10 re 15 su 21 na 31
            // uc 1 oe 11 eq 16 uc 22 al 32
            // ce 2 es 12 qu 17 ch 23 ha 33
            // en 3 no 13 ui 18 ad 24 ar 34
            // ne 4 ot 14 ir 19 dd 25 rd 35
            // e/ 5 re 20 di 26 dw 36
            // /S 6 it 27 wa 37
            // So 7 ti 28 ar 38
            // ol 8 io 29 re 39
            // lr 9 on 30

            // 5555666666666677777777778888888888999999999
            // 6789012345678901234567890123456789012345678
            // *When you talk about processing speed, the
            // Wh 40 ab 48 es 56 th 65
            // he 41 bo 49 ss 57 he 66
            // en 42 ou 50 si 58
            // yo 43 ut 51 in 59
            // ou 44 pr 52 ng 60
            // ta 45 ro 53 sp 61
            // al 46 oc 54 pe 62
            // lk 47 ce 55 ee 63
            // ed 64

            Make1dmfIndexB(biMVValues);
        }

        protected void MakeIndexStrMV()
        {
            // 0123
            // "abc"

            // 34567
            // "defg"

            // 111
            // 789012
            // "hijkl"
            Make1dmfIndexNA(strMVValues);
        }
    }
}