132 lines
4.8 KiB
C#
132 lines
4.8 KiB
C#
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
|
|
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using Lucene.Net.Analysis;
|
|
using Lucene.Net.Analysis.Cn;
|
|
using Lucene.Net.Analysis.Tokenattributes;
|
|
using Lucene.Net.Test.Analysis;
|
|
using NUnit.Framework;
|
|
|
|
namespace Lucene.Net.Analyzers.Cn
|
|
{
|
|
[TestFixture]
public class TestChineseTokenizer : BaseTokenStreamTestCase
{
    /// <summary>
    /// Tokenizes "a天b" directly with <see cref="ChineseTokenizer"/> and checks that
    /// the n-th token (zero-based) reports a start offset of n and an end offset of n + 1.
    /// </summary>
    [Test]
    public void TestOtherLetterOffset()
    {
        string s = "a天b";

        // The offsets asserted below advance by exactly one character per token, so the
        // whole input is covered only if one token is produced per input character.
        int expectedTokenCount = s.Length;
        int tokenCount = 0;

        // Dispose the tokenizer deterministically so the underlying reader is released
        // even when one of the assertions throws.
        using (ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s)))
        {
            IOffsetAttribute offsetAtt = tokenizer.GetAttribute<IOffsetAttribute>();
            while (tokenizer.IncrementToken())
            {
                Assert.AreEqual(tokenCount, offsetAtt.StartOffset);
                Assert.AreEqual(tokenCount + 1, offsetAtt.EndOffset);
                tokenCount++;
            }
        }

        // Without this check the test would pass vacuously if the tokenizer emitted
        // no tokens at all, or silently dropped trailing input.
        Assert.AreEqual(expectedTokenCount, tokenCount, "unexpected number of tokens");
    }

    /// <summary>
    /// Runs two different inputs through the same <see cref="ChineseAnalyzer"/> instance
    /// via <c>AssertAnalyzesToReuse</c>, expecting one token per character with
    /// consecutive single-character offsets.
    /// </summary>
    [Test]
    public void TestReusableTokenStream()
    {
        Analyzer a = new ChineseAnalyzer();

        AssertAnalyzesToReuse(a, "中华人民共和国",
            new[] { "中", "华", "人", "民", "共", "和", "国" },
            new[] { 0, 1, 2, 3, 4, 5, 6 },
            new[] { 1, 2, 3, 4, 5, 6, 7 });

        AssertAnalyzesToReuse(a, "北京市",
            new[] { "北", "京", "市" },
            new[] { 0, 1, 2 },
            new[] { 1, 2, 3 });
    }

    /// <summary>
    /// Analyzer that just uses ChineseTokenizer, not ChineseFilter.
    /// Convenience to show the behavior of the tokenizer.
    /// </summary>
    private sealed class JustChineseTokenizerAnalyzer : Analyzer
    {
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            return new ChineseTokenizer(reader);
        }
    }

    /// <summary>
    /// Analyzer that just uses ChineseFilter, not ChineseTokenizer.
    /// Convenience to show the behavior of the filter.
    /// </summary>
    private sealed class JustChineseFilterAnalyzer : Analyzer
    {
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Whitespace tokenization keeps case and punctuation intact, so whatever
            // is removed from the stream is removed by ChineseFilter alone.
            return new ChineseFilter(new WhitespaceTokenizer(reader));
        }
    }

    /// <summary>
    /// ChineseTokenizer tokenizes numbers as one token, but they are filtered by ChineseFilter.
    /// </summary>
    [Test]
    public void TestNumerics()
    {
        Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
        AssertAnalyzesTo(justTokenizer, "中1234", new[] { "中", "1234" });

        // in this case the ChineseAnalyzer (which applies ChineseFilter) will remove the numeric token.
        Analyzer a = new ChineseAnalyzer();
        AssertAnalyzesTo(a, "中1234", new[] { "中" });
    }

    /// <summary>
    /// ChineseTokenizer tokenizes english similar to SimpleAnalyzer.
    /// It will lowercase terms automatically.
    ///
    /// ChineseFilter has an english stopword list, it also removes any single character tokens.
    /// The stopword list is case-sensitive.
    /// </summary>
    [Test]
    public void TestEnglish()
    {
        // Tokenizer + filter: terms are lowercased, then stopwords and
        // single-character tokens are dropped.
        Analyzer chinese = new ChineseAnalyzer();
        AssertAnalyzesTo(chinese, "This is a Test. b c d",
            new[] { "test" });

        // Tokenizer only: everything is kept, lowercased, punctuation stripped.
        Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
        AssertAnalyzesTo(justTokenizer, "This is a Test. b c d",
            new[] { "this", "is", "a", "test", "b", "c", "d" });

        // Filter only (on whitespace tokens): "This" and "Test." survive unchanged,
        // the remaining tokens are removed.
        Analyzer justFilter = new JustChineseFilterAnalyzer();
        AssertAnalyzesTo(justFilter, "This is a Test. b c d",
            new[] { "This", "Test." });
    }
}
|
|
}
|