180 lines
6.3 KiB
C#
Raw Normal View History

/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Fr;
using Lucene.Net.Test.Analysis;
using NUnit.Framework;
using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analyzers.Fr
{
/*
* Test case for FrenchAnalyzer.
*
* @version $version$
*/
[TestFixture]
public class TestFrenchAnalyzer : BaseTokenStreamTestCase
{
[Test]
public void TestAnalyzer()
{
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
AssertAnalyzesTo(fa, "", new String[0]);
AssertAnalyzesTo(
fa,
"chien chat cheval",
new String[] {"chien", "chat", "cheval"});
AssertAnalyzesTo(
fa,
"chien CHAT CHEVAL",
new String[] {"chien", "chat", "cheval"});
AssertAnalyzesTo(
fa,
" chien ,? + = - CHAT /: > CHEVAL",
new String[] {"chien", "chat", "cheval"});
AssertAnalyzesTo(fa, "chien++", new String[] {"chien"});
AssertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
new String[] {"mot", "entreguillemet"});
// let's do some french specific tests now
/* 1. couldn't resist
I would expect this to stay one term as in French the minus
sign is often used for composing words */
AssertAnalyzesTo(
fa,
"Jean-François",
new String[] {"jean", "françois"});
// 2. stopwords
AssertAnalyzesTo(
fa,
"le la chien les aux chat du des à cheval",
new String[] {"chien", "chat", "cheval"});
// some nouns and adjectives
AssertAnalyzesTo(
fa,
"lances chismes habitable chiste éléments captifs",
new String[]
{
"lanc",
"chism",
"habit",
"chist",
"élément",
"captif"
});
// some verbs
AssertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
new String[] {"fin", "souffr", "rug"});
// some everything else
// aujourd'hui stays one term which is OK
AssertAnalyzesTo(
fa,
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
new String[]
{
"c3po",
"aujourd'hui",
"oeuf",
"ïâöûàä",
"anticonstitutionnel",
"jav"
});
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
AssertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
new String[] {"33bis", "1940-1945", "1940", "1945", "i"});
AssertAnalyzesTo(fa, "abbeaux abdication abdications abondamment marieuses pageaux", new[]
{
"abbeau",
"abdiqu",
"abdiqu",
"abond",
"marieux",
"pageau"
});
}
[Test]
public void TestReusableTokenStream()
{
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
// stopwords
AssertAnalyzesToReuse(
fa,
"le la chien les aux chat du des à cheval",
new String[] {"chien", "chat", "cheval"});
// some nouns and adjectives
AssertAnalyzesToReuse(
fa,
"lances chismes habitable chiste éléments captifs",
new String[]
{
"lanc",
"chism",
"habit",
"chist",
"élément",
"captif"
});
}
/*
* Test that changes to the exclusion table are applied immediately
* when using reusable token streams.
*/
[Test]
public void TestExclusionTableReuse()
{
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
AssertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
fa.SetStemExclusionTable(new String[] { "habitable" });
AssertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
}
}
}