113 lines
4.0 KiB
C#
113 lines
4.0 KiB
C#
|
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
|
|||
|
|
|||
|
using System;
|
|||
|
using System.Collections.Generic;
|
|||
|
using System.IO;
|
|||
|
using System.Linq;
|
|||
|
using System.Text;
|
|||
|
using Lucene.Net.Analysis;
|
|||
|
using Lucene.Net.Analysis.Ru;
|
|||
|
using Lucene.Net.Analysis.Tokenattributes;
|
|||
|
using Lucene.Net.Test.Analysis;
|
|||
|
using NUnit.Framework;
|
|||
|
using Version=Lucene.Net.Util.Version;
|
|||
|
|
|||
|
namespace Lucene.Net.Analyzers.Ru
|
|||
|
{
|
|||
|
/// <summary>
/// Test case for <see cref="RussianAnalyzer"/>.
/// </summary>
[TestFixture]
public class TestRussianAnalyzer : BaseTokenStreamTestCase
{
    /// <summary>
    /// Runs the analyzer over the UTF-8 input file <c>ru/testUTF8.txt</c> and checks
    /// every term it produces against the next token read from <c>ru/resUTF8.txt</c>
    /// through a <see cref="RussianLetterTokenizer"/>.
    /// </summary>
    /// <remarks>
    /// Both files are opened through relative paths. The loop stops as soon as the
    /// analyzer runs out of tokens; if the reference file runs out first, the
    /// expected term is <c>null</c> and the assertion fails.
    /// </remarks>
    [Test]
    public void TestUnicode()
    {
        RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);

        // Path.Combine keeps the fixture paths independent of the platform's
        // directory separator (on Windows it yields the original "ru\..." paths).
        string inputPath = Path.Combine("ru", "testUTF8.txt");
        string expectedPath = Path.Combine("ru", "resUTF8.txt");

        // The readers are locals: they are disposed at the end of this block, so
        // nothing outside the test should be able to reach them afterwards.
        using (StreamReader inWords = new StreamReader(inputPath, Encoding.UTF8))
        using (StreamReader sampleUnicode = new StreamReader(expectedPath, Encoding.UTF8))
        {
            TokenStream analyzed = ra.TokenStream("all", inWords);
            RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode);

            ITermAttribute text = analyzed.GetAttribute<ITermAttribute>();
            ITermAttribute sampleText = sample.GetAttribute<ITermAttribute>();

            while (analyzed.IncrementToken())
            {
                // null stands for "reference file has no more tokens".
                string expectedTerm = sample.IncrementToken() ? sampleText.Term : null;

                // NUnit order is (expected, actual): the reference token is the
                // expectation, the analyzer output is the value under test.
                Assert.AreEqual(expectedTerm, text.Term, "Unicode");
            }
        }
    }

    /// <summary>
    /// Checks that analyzing <c>"text 1000"</c> yields exactly two terms,
    /// <c>"text"</c> and <c>"1000"</c>, i.e. that the number is not dropped.
    /// </summary>
    [Test]
    public void TestDigitsInRussianCharset()
    {
        TextReader reader = new StringReader("text 1000");
        RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
        TokenStream stream = ra.TokenStream("", reader);

        ITermAttribute termText = stream.GetAttribute<ITermAttribute>();

        // No try/catch here: an unexpected IOException propagates to NUnit, which
        // fails the test and reports the exception message and stack trace.
        Assert.True(stream.IncrementToken());
        Assert.AreEqual("text", termText.Term);
        Assert.True(stream.IncrementToken());
        Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");
        Assert.False(stream.IncrementToken());
    }

    /// <summary>
    /// Passes two Russian sentences through the same analyzer instance via
    /// <c>AssertAnalyzesToReuse</c> and checks the terms produced for each.
    /// </summary>
    [Test]
    public void TestReusableTokenStream()
    {
        Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);

        AssertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
                              new String[] {"вмест", "сил", "электромагнитн", "энерг", "имел", "представлен"});

        AssertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
                              new String[] {"знан", "хран", "тайн"});
    }
}
|
|||
|
}
|