/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.CJK;
using Lucene.Net.Test.Analysis;
using NUnit.Framework;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analyzers.Cjk
{
    [TestFixture]
    public class TestCJKTokenizer : BaseTokenStreamTestCase
    {
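        /// <summary>
        /// Container for an expected token: term text, start/end offsets, and token type.
        /// </summary>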
        public class TestToken
        {
            protected internal String termText;
            protected internal int start;
            protected internal int end;
            protected internal String type;
        }

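        /// <summary>
        /// Builds an expected token, mapping the numeric type constant to its name
        /// via CJKTokenizer.TOKEN_TYPE_NAMES.
        /// </summary>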
        public TestToken NewToken(String termText, int start, int end, int type)
        {
            TestToken token = new TestToken();
            token.termText = termText;
            token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type];
            token.start = start;
            token.end = end;
            return token;
        }

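        /// <summary>
        /// Analyzes <paramref name="str"/> with a fresh CJKAnalyzer and asserts that the
        /// produced terms, offsets, and types match <paramref name="out_tokens"/>.
        /// </summary>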
        public void CheckCjkToken(String str, TestToken[] out_tokens)
        {
            Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
            String[] terms = new String[out_tokens.Length];
            int[] startOffsets = new int[out_tokens.Length];
            int[] endOffsets = new int[out_tokens.Length];
            String[] types = new String[out_tokens.Length];
            for (int i = 0; i < out_tokens.Length; i++)
            {
                terms[i] = out_tokens[i].termText;
                startOffsets[i] = out_tokens[i].start;
                endOffsets[i] = out_tokens[i].end;
                types[i] = out_tokens[i].type;
            }
            AssertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
        }

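        /// <summary>
        /// Like CheckCjkToken, but analyzes with the supplied analyzer so the
        /// reusable token stream path is exercised.
        /// </summary>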
        public void CheckCjkTokenReusable(Analyzer a, String str, TestToken[] out_tokens)
        {
            String[] terms = new String[out_tokens.Length];
            int[] startOffsets = new int[out_tokens.Length];
            int[] endOffsets = new int[out_tokens.Length];
            String[] types = new String[out_tokens.Length];
            for (int i = 0; i < out_tokens.Length; i++)
            {
                terms[i] = out_tokens[i].termText;
                startOffsets[i] = out_tokens[i].start;
                endOffsets[i] = out_tokens[i].end;
                types[i] = out_tokens[i].type;
            }
            // Use the caller's analyzer rather than a fresh one, so that analyzer
            // reuse is actually being tested.
            AssertAnalyzesToReuse(a, str, terms, startOffsets, endOffsets, types, null);
        }

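        /*
         * Pure CJK input is tokenized into overlapping bigrams: C1C2, C2C3, ...
         */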
        [Test]
        public void TestJa1()
        {
            String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";

            TestToken[] out_tokens = {
                NewToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e5d\u5341", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

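        /*
         * Whitespace breaks bigram runs; an isolated CJK character is emitted alone.
         */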
        [Test]
        public void TestJa2()
        {
            String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";

            TestToken[] out_tokens = {
                NewToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u5341", 12, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

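        /*
         * Plain ASCII words are emitted whole as single tokens.
         */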
        [Test]
        public void TestC()
        {
            String str = "abc defgh ijklmn opqrstu vwxy z";

            TestToken[] out_tokens = {
                NewToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

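        /*
         * Mixed kana and ASCII: bigramming restarts at each script boundary.
         */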
        [Test]
        public void TestMix()
        {
            String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

            TestToken[] out_tokens = {
                NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

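        /*
         * A script change or whitespace both terminate a bigram run; a CJK character
         * stranded between ASCII letters is emitted by itself.
         */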
        [Test]
        public void TestMix2()
        {
            String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";

            TestToken[] out_tokens = {
                NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3053", 14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

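        /*
         * A single CJK character yields one token (still typed as a "double" token).
         */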
        [Test]
        public void TestSingleChar()
        {
            String str = "\u4e00";

            TestToken[] out_tokens = {
                NewToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

        /*
         * Full-width text is normalized to half-width.
         */
        [Test]
        public void TestFullWidth()
        {
            // Full-width forms ("Ｔｅｓｔ １２３４"); extraction had flattened these to
            // half-width, which would make the test vacuous.
            String str = "\uff34\uff45\uff53\uff54 \uff11\uff12\uff13\uff14";
            TestToken[] out_tokens = {
                NewToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

        /*
         * Non-English text (not just CJK) is treated the same as CJK: C1C2 C2C3.
         */
        [Test]
        public void TestNonIdeographic()
        {
            String str = "\u4e00 روبرت موير";
            TestToken[] out_tokens = {
                NewToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

        /*
         * Non-English text with non-letters (non-spacing marks, etc.) is treated as
         * C1C2 C2C3, except that words are split around the non-letters.
         */
        [Test]
        public void TestNonIdeographicNonLetter()
        {
            String str = "\u4e00 رُوبرت موير";
            TestToken[] out_tokens = {
                NewToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkToken(str, out_tokens);
        }

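        /*
         * Sanity check: a three-character CJK run yields two overlapping bigrams.
         */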
        [Test]
        public void TestTokenStream()
        {
            Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
            AssertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
                             new String[] { "\u4e00\u4e01", "\u4e01\u4e02" });
        }

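        /*
         * Reusing one analyzer instance across inputs must give the same tokens
         * as analyzing each input with a fresh analyzer.
         */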
        [Test]
        public void TestReusableTokenStream()
        {
            Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
            String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

            TestToken[] out_tokens = {
                NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkTokenReusable(analyzer, str, out_tokens);

            str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
            TestToken[] out_tokens2 = {
                NewToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("\u3053", 14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            };
            CheckCjkTokenReusable(analyzer, str, out_tokens2);
        }

        /*
         * LUCENE-2207: wrong offset calculated by end().
         */
        [Test]
        public void TestFinalOffset()
        {
            CheckCjkToken("あい", new TestToken[]
            {
                NewToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            });
            CheckCjkToken("あい ", new TestToken[]
            {
                NewToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            });
            CheckCjkToken("test", new TestToken[]
            {
                NewToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE)
            });
            CheckCjkToken("test ", new TestToken[]
            {
                NewToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE)
            });
            CheckCjkToken("あいtest", new TestToken[]
            {
                NewToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
                NewToken("test", 2, 6, CJKTokenizer.SINGLE_TOKEN_TYPE)
            });
            CheckCjkToken("testあい ", new TestToken[]
            {
                NewToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
                NewToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE)
            });
        }
    }
}