113 lines
4.0 KiB
C#
113 lines
4.0 KiB
C#
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
|
||
|
||
using System;
|
||
using System.Collections.Generic;
|
||
using System.IO;
|
||
using System.Linq;
|
||
using System.Text;
|
||
using Lucene.Net.Analysis;
|
||
using Lucene.Net.Analysis.Ru;
|
||
using Lucene.Net.Analysis.Tokenattributes;
|
||
using Lucene.Net.Test.Analysis;
|
||
using NUnit.Framework;
|
||
using Version=Lucene.Net.Util.Version;
|
||
|
||
namespace Lucene.Net.Analyzers.Ru
{
    /// <summary>
    /// Test case for <see cref="RussianAnalyzer"/>.
    /// </summary>
    [TestFixture]
    public class TestRussianAnalyzer : BaseTokenStreamTestCase
    {
        /// <summary>
        /// Analyzes the UTF-8 text in <c>ru/testUTF8.txt</c> and compares every term the analyzer
        /// produces with the next token read from <c>ru/resUTF8.txt</c>.
        /// </summary>
        /// <remarks>
        /// The loop stops when the analyzer stream is exhausted; tokens remaining in the sample
        /// file after that point are not checked. If the sample runs out first, the analyzer term
        /// is compared against <c>null</c> and the assertion fails.
        /// </remarks>
        [Test]
        public void TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);

            // Path.Combine yields "ru\..." on Windows (same as before) and "ru/..." elsewhere,
            // so the fixture files are found regardless of the platform's separator.
            string inputPath = Path.Combine("ru", "testUTF8.txt");
            string samplePath = Path.Combine("ru", "resUTF8.txt");

            // The readers are only needed by this test, so they are locals rather than fields.
            using (StreamReader inWords = new StreamReader(inputPath, Encoding.UTF8))
            using (StreamReader sampleUnicode = new StreamReader(samplePath, Encoding.UTF8))
            {
                TokenStream analyzed = ra.TokenStream("all", inWords);
                RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode);

                ITermAttribute text = analyzed.GetAttribute<ITermAttribute>();
                ITermAttribute sampleText = sample.GetAttribute<ITermAttribute>();

                while (analyzed.IncrementToken())
                {
                    // A missing sample token is represented as null so the comparison fails.
                    string expected = sample.IncrementToken() ? sampleText.Term : null;

                    // NUnit argument order is (expected, actual): the sample file is the reference.
                    Assert.AreEqual(expected, text.Term, "Unicode");
                }
            }
        }

        /// <summary>
        /// Checks that analyzing <c>"text 1000"</c> yields exactly two terms, <c>"text"</c> and
        /// <c>"1000"</c>, i.e. that the digits are not dropped.
        /// </summary>
        [Test]
        public void TestDigitsInRussianCharset()
        {
            TextReader reader = new StringReader("text 1000");
            RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
            TokenStream stream = ra.TokenStream("", reader);

            ITermAttribute termText = stream.GetAttribute<ITermAttribute>();

            // No try/catch here: an unexpected IOException propagates to NUnit, which fails the
            // test and reports the exception's message and stack trace instead of hiding them.
            Assert.True(stream.IncrementToken());
            Assert.AreEqual("text", termText.Term);
            Assert.True(stream.IncrementToken());
            Assert.AreEqual("1000", termText.Term, "RussianAnalyzer's tokenizer skips numbers from input text");
            Assert.False(stream.IncrementToken());
        }

        /// <summary>
        /// Passes two different Russian sentences through the same analyzer instance via
        /// <c>AssertAnalyzesToReuse</c> and checks the expected terms for each.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <c>AssertAnalyzesToReuse</c> is not defined in this file; it is presumably
        /// inherited from <c>BaseTokenStreamTestCase</c> - confirm.
        /// </remarks>
        [Test]
        public void TestReusableTokenStream()
        {
            Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);

            AssertAnalyzesToReuse(
                a,
                "Вместе с тем о силе электромагнитной энергии имели представление еще",
                new string[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });

            AssertAnalyzesToReuse(
                a,
                "Но знание это хранилось в тайне",
                new string[] { "знан", "хран", "тайн" });
        }
    }
}
|