Jo Shields a575963da9 Imported Upstream version 3.6.0
Former-commit-id: da6be194a6b1221998fc28233f2503bd61dd9d14
2014-08-13 10:39:27 +01:00

180 lines
7.3 KiB
C#

/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.BR;
using Lucene.Net.Test.Analysis;
using NUnit.Framework;
using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analyzers.Br
{
/// <summary>
/// Tests the Brazilian stem filter, which only modifies the term text.
/// <para>
/// It is very similar to the snowball portuguese algorithm but not exactly the same.
/// </para>
/// </summary>
[TestFixture]
public class TestBrazilianStemmer : BaseTokenStreamTestCase
{
    /// <summary>
    /// Runs a list of sample words through a fresh <see cref="BrazilianAnalyzer"/> and
    /// verifies the single term produced for each. Trailing comments flag the entries
    /// whose expected output differs from the snowball portuguese stemmer.
    /// </summary>
    [Test]
    public void TestWithSnowballExamples()
    {
        Check("boa", "boa");
        Check("boainain", "boainain");
        Check("boas", "boas");
        Check("bôas", "boas"); // removes diacritic: different from snowball portuguese
        Check("boassu", "boassu");
        Check("boataria", "boat");
        Check("boate", "boat");
        Check("boates", "boat");
        Check("boatos", "boat");
        Check("bob", "bob");
        Check("boba", "bob");
        Check("bobagem", "bobag");
        Check("bobagens", "bobagens");
        Check("bobalhões", "bobalho"); // removes diacritic: different from snowball portuguese
        Check("bobear", "bob");
        Check("bobeira", "bobeir");
        Check("bobinho", "bobinh");
        Check("bobinhos", "bobinh");
        Check("bobo", "bob");
        Check("bobs", "bobs");
        Check("boca", "boc");
        Check("bocadas", "boc");
        Check("bocadinho", "bocadinh");
        Check("bocado", "boc");
        Check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
        Check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
        Check("bocarra", "bocarr");
        Check("bocas", "boc");
        Check("bode", "bod");
        Check("bodoque", "bodoqu");
        Check("body", "body");
        Check("boeing", "boeing");
        Check("boem", "boem");
        Check("boemia", "boem");
        Check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
        Check("bogotá", "bogot");
        Check("boi", "boi");
        Check("bóia", "boi"); // removes diacritic: different from snowball portuguese
        Check("boiando", "boi");
        Check("quiabo", "quiab");
        Check("quicaram", "quic");
        Check("quickly", "quickly");
        Check("quieto", "quiet");
        Check("quietos", "quiet");
        Check("quilate", "quilat");
        Check("quilates", "quilat");
        Check("quilinhos", "quilinh");
        Check("quilo", "quil");
        Check("quilombo", "quilomb");
        Check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilômetro", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilômetros", "quilometr"); // removes diacritic: different from snowball portuguese
        Check("quilos", "quil");
        Check("quimica", "quimic");
        Check("quimicas", "quimic");
        Check("quimico", "quimic");
        Check("quimicos", "quimic");
        Check("quimioterapia", "quimioterap");
        Check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portuguese
        Check("quimono", "quimon");
        Check("quincas", "quinc");
        Check("quinhão", "quinha"); // removes diacritic: different from snowball portuguese
        Check("quinhentos", "quinhent");
        Check("quinn", "quinn");
        Check("quino", "quin");
        Check("quinta", "quint");
        Check("quintal", "quintal");
        Check("quintana", "quintan");
        Check("quintanilha", "quintanilh");
        Check("quintão", "quinta"); // removes diacritic: different from snowball portuguese
        Check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
        Check("quintino", "quintin");
        Check("quinto", "quint");
        Check("quintos", "quint");
        Check("quintuplicou", "quintuplic");
        Check("quinze", "quinz");
        Check("quinzena", "quinzen");
        Check("quiosque", "quiosqu");
    }

    /// <summary>
    /// Verifies the expected lowercasing and diacritic handling of the analyzer output.
    /// </summary>
    // Marked with [Test] like every other test in this fixture; without the attribute
    // NUnit never discovers the method and these assertions are silently skipped.
    [Test]
    public void TestNormalization()
    {
        Check("Brasil", "brasil"); // lowercase by default
        Check("Brasília", "brasil"); // remove diacritics
        Check("quimio5terápicos", "quimio5terapicos"); // contains non-letter, diacritic will still be removed
        Check("áá", "áá"); // token is too short: diacritics are not removed
        Check("ááá", "aaa"); // normally, diacritics are removed
    }

    /// <summary>
    /// Verifies that a single analyzer instance yields the expected terms when it is
    /// used for several inputs in a row via <see cref="CheckReuse"/>.
    /// </summary>
    [Test]
    public void TestReusableTokenStream()
    {
        Analyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
        CheckReuse(a, "boa", "boa");
        CheckReuse(a, "boainain", "boainain");
        CheckReuse(a, "boas", "boas");
        CheckReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portuguese
    }

    /// <summary>
    /// Verifies that a word placed in the stem exclusion table is emitted unchanged.
    /// </summary>
    [Test]
    public void TestStemExclusionTable()
    {
        BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
        a.SetStemExclusionTable(new[] { "quintessência" });
        CheckReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
    }

    /// <summary>
    /// Test that changes to the exclusion table are applied immediately
    /// when using reusable token streams.
    /// </summary>
    [Test]
    public void TestExclusionTableReuse()
    {
        BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);

        // Before the exclusion is registered the word is stemmed ...
        CheckReuse(a, "quintessência", "quintessente");

        // ... and afterwards the same analyzer instance must leave it untouched.
        a.SetStemExclusionTable(new[] { "quintessência" });
        CheckReuse(a, "quintessência", "quintessência");
    }

    /// <summary>
    /// Analyzes <paramref name="input"/> with a newly created
    /// <see cref="BrazilianAnalyzer"/> and checks the result via <c>CheckOneTerm</c>.
    /// </summary>
    /// <param name="input">Text handed to the analyzer.</param>
    /// <param name="expected">Term text the analyzer is expected to produce.</param>
    private void Check(string input, string expected)
    {
        CheckOneTerm(new BrazilianAnalyzer(Version.LUCENE_CURRENT), input, expected);
    }

    /// <summary>
    /// Analyzes <paramref name="input"/> with the supplied analyzer instance and checks
    /// the result via <c>CheckOneTermReuse</c>.
    /// </summary>
    /// <param name="a">Analyzer instance to exercise; callers pass the same instance repeatedly.</param>
    /// <param name="input">Text handed to the analyzer.</param>
    /// <param name="expected">Term text the analyzer is expected to produce.</param>
    private void CheckReuse(Analyzer a, string input, string expected)
    {
        CheckOneTermReuse(a, input, expected);
    }
}
}