180 lines
7.3 KiB
C#
180 lines
7.3 KiB
C#
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
|
|
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using Lucene.Net.Analysis;
|
|
using Lucene.Net.Analysis.BR;
|
|
using Lucene.Net.Test.Analysis;
|
|
using NUnit.Framework;
|
|
using Version = Lucene.Net.Util.Version;
|
|
|
|
namespace Lucene.Net.Analyzers.Br
|
|
{
|
|
/*
 * Test the Brazilian Stem Filter, which only modifies the term text.
 *
 * It is very similar to the snowball portuguese algorithm but not exactly the same.
 *
 * Every expectation below is asserted through the CheckOneTerm / CheckOneTermReuse
 * helpers, which are not defined in this file (presumably inherited from
 * BaseTokenStreamTestCase).
 */
[TestFixture]
public class TestBrazilianStemmer : BaseTokenStreamTestCase
{
    /*
     * Runs a sample of words through a fresh BrazilianAnalyzer and compares each
     * produced term with the expected stem. Entries with a trailing comment are
     * the ones whose result differs from the snowball portuguese stemmer.
     */
    [Test]
    public void TestWithSnowballExamples()
    {
        Check("boa", "boa");
        Check("boainain", "boainain");
        Check("boas", "boas");
        Check("bôas", "boas"); // removes diacritic: different from snowball portuguese
        Check("boassu", "boassu");
        Check("boataria", "boat");
        Check("boate", "boat");
        Check("boates", "boat");
        Check("boatos", "boat");
        Check("bob", "bob");
        Check("boba", "bob");
        Check("bobagem", "bobag");
        Check("bobagens", "bobagens");
        Check("bobalhões", "bobalho"); // removes diacritic: different from snowball portuguese
        Check("bobear", "bob");
        Check("bobeira", "bobeir");
        Check("bobinho", "bobinh");
        Check("bobinhos", "bobinh");
        Check("bobo", "bob");
        Check("bobs", "bobs");
        Check("boca", "boc");
        Check("bocadas", "boc");
        Check("bocadinho", "bocadinh");
        Check("bocado", "boc");
        Check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
        Check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
        Check("bocarra", "bocarr");
        Check("bocas", "boc");
        Check("bode", "bod");
        Check("bodoque", "bodoqu");
        Check("body", "body");
        Check("boeing", "boeing");
        Check("boem", "boem");
        Check("boemia", "boem");
        Check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
        Check("bogotá", "bogot");
        Check("boi", "boi");
        Check("bóia", "boi"); // removes diacritic: different from snowball portuguese
        Check("boiando", "boi");
        Check("quiabo", "quiab");
        Check("quicaram", "quic");
        Check("quickly", "quickly");
        Check("quieto", "quiet");
        Check("quietos", "quiet");
        Check("quilate", "quilat");
        Check("quilates", "quilat");
        Check("quilinhos", "quilinh");
        Check("quilo", "quil");
        Check("quilombo", "quilomb");
        Check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilômetro", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilômetros", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilos", "quil");
        Check("quimica", "quimic");
        Check("quimicas", "quimic");
        Check("quimico", "quimic");
        Check("quimicos", "quimic");
        Check("quimioterapia", "quimioterap");
        Check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portuguese
        Check("quimono", "quimon");
        Check("quincas", "quinc");
        Check("quinhão", "quinha"); // removes diacritic: different from snowball portuguese
        Check("quinhentos", "quinhent");
        Check("quinn", "quinn");
        Check("quino", "quin");
        Check("quinta", "quint");
        Check("quintal", "quintal");
        Check("quintana", "quintan");
        Check("quintanilha", "quintanilh");
        Check("quintão", "quinta"); // removes diacritic: different from snowball portuguese
        Check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
        Check("quintino", "quintin");
        Check("quinto", "quint");
        Check("quintos", "quint");
        Check("quintuplicou", "quintuplic");
        Check("quinze", "quinz");
        Check("quinzena", "quinzen");
        Check("quiosque", "quiosqu");
    }

    /*
     * Checks the normalization expectations spelled out per case below:
     * lowercasing, diacritic removal, tokens containing a non-letter,
     * and very short tokens.
     */
    [Test]
    public void TestNormalization()
    {
        Check("Brasil", "brasil"); // lowercase by default
        Check("Brasília", "brasil"); // remove diacritics
        Check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will still be removed
        Check("áá", "áá"); // token is too short: diacritics are not removed
        Check("ááá", "aaa"); // normally, diacritics are removed
    }

    /*
     * Feeds several terms through one shared analyzer instance using the
     * reuse-based check helper.
     */
    [Test]
    public void TestReusableTokenStream()
    {
        Analyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
        CheckReuse(a, "boa", "boa");
        CheckReuse(a, "boainain", "boainain");
        CheckReuse(a, "boas", "boas");
        CheckReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portuguese
    }

    /*
     * Test that a word placed in the stem exclusion table is emitted as-is.
     */
    [Test]
    public void TestStemExclusionTable()
    {
        BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
        a.SetStemExclusionTable(new string[] { "quintessência" });
        CheckReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
    }

    /*
     * Test that changes to the exclusion table are applied immediately
     * when using reusable token streams.
     */
    [Test]
    public void TestExclusionTableReuse()
    {
        BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);

        // Before the exclusion table is set, the word is expected to be stemmed.
        CheckReuse(a, "quintessência", "quintessente");

        // Same analyzer instance, after excluding the word: expected unchanged.
        a.SetStemExclusionTable(new string[] { "quintessência" });
        CheckReuse(a, "quintessência", "quintessência");
    }

    /*
     * Passes a newly constructed BrazilianAnalyzer(Version.LUCENE_CURRENT),
     * together with input and expected, to CheckOneTerm.
     */
    private void Check(string input, string expected)
    {
        CheckOneTerm(new BrazilianAnalyzer(Version.LUCENE_CURRENT), input, expected);
    }

    /*
     * Passes the caller-supplied analyzer, together with input and expected,
     * to CheckOneTermReuse.
     */
    private void CheckReuse(Analyzer a, string input, string expected)
    {
        CheckOneTermReuse(a, input, expected);
    }
}
|
|
}
|