180 lines
6.3 KiB
C#
180 lines
6.3 KiB
C#
|
/*
|
||
|
*
|
||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||
|
* or more contributor license agreements. See the NOTICE file
|
||
|
* distributed with this work for additional information
|
||
|
* regarding copyright ownership. The ASF licenses this file
|
||
|
* to you under the Apache License, Version 2.0 (the
|
||
|
* "License"); you may not use this file except in compliance
|
||
|
* with the License. You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing,
|
||
|
* software distributed under the License is distributed on an
|
||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||
|
* KIND, either express or implied. See the License for the
|
||
|
* specific language governing permissions and limitations
|
||
|
* under the License.
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
using System;
|
||
|
using System.Collections.Generic;
|
||
|
using System.Linq;
|
||
|
using System.Text;
|
||
|
using Lucene.Net.Analysis;
|
||
|
using Lucene.Net.Analysis.Fr;
|
||
|
using Lucene.Net.Test.Analysis;
|
||
|
using NUnit.Framework;
|
||
|
using Version = Lucene.Net.Util.Version;
|
||
|
|
||
|
namespace Lucene.Net.Analyzers.Fr
|
||
|
{
|
||
|
/*
|
||
|
* Test case for FrenchAnalyzer.
|
||
|
*
|
||
|
* @version $version$
|
||
|
*/
|
||
|
[TestFixture]
|
||
|
public class TestFrenchAnalyzer : BaseTokenStreamTestCase
|
||
|
{
|
||
|
[Test]
|
||
|
public void TestAnalyzer()
|
||
|
{
|
||
|
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
|
||
|
|
||
|
AssertAnalyzesTo(fa, "", new String[0]);
|
||
|
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"chien chat cheval",
|
||
|
new String[] {"chien", "chat", "cheval"});
|
||
|
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"chien CHAT CHEVAL",
|
||
|
new String[] {"chien", "chat", "cheval"});
|
||
|
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
" chien ,? + = - CHAT /: > CHEVAL",
|
||
|
new String[] {"chien", "chat", "cheval"});
|
||
|
|
||
|
AssertAnalyzesTo(fa, "chien++", new String[] {"chien"});
|
||
|
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"mot \"entreguillemet\"",
|
||
|
new String[] {"mot", "entreguillemet"});
|
||
|
|
||
|
// let's do some french specific tests now
|
||
|
|
||
|
/* 1. couldn't resist
|
||
|
I would expect this to stay one term as in French the minus
|
||
|
sign is often used for composing words */
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"Jean-François",
|
||
|
new String[] {"jean", "françois"});
|
||
|
|
||
|
// 2. stopwords
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"le la chien les aux chat du des à cheval",
|
||
|
new String[] {"chien", "chat", "cheval"});
|
||
|
|
||
|
// some nouns and adjectives
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"lances chismes habitable chiste éléments captifs",
|
||
|
new String[]
|
||
|
{
|
||
|
"lanc",
|
||
|
"chism",
|
||
|
"habit",
|
||
|
"chist",
|
||
|
"élément",
|
||
|
"captif"
|
||
|
});
|
||
|
|
||
|
// some verbs
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"finissions souffrirent rugissante",
|
||
|
new String[] {"fin", "souffr", "rug"});
|
||
|
|
||
|
// some everything else
|
||
|
// aujourd'hui stays one term which is OK
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
|
||
|
new String[]
|
||
|
{
|
||
|
"c3po",
|
||
|
"aujourd'hui",
|
||
|
"oeuf",
|
||
|
"ïâöûàä",
|
||
|
"anticonstitutionnel",
|
||
|
"jav"
|
||
|
});
|
||
|
|
||
|
// some more everything else
|
||
|
// here 1940-1945 stays as one term, 1940:1945 not ?
|
||
|
AssertAnalyzesTo(
|
||
|
fa,
|
||
|
"33Bis 1940-1945 1940:1945 (---i+++)*",
|
||
|
new String[] {"33bis", "1940-1945", "1940", "1945", "i"});
|
||
|
|
||
|
|
||
|
AssertAnalyzesTo(fa, "abbeaux abdication abdications abondamment marieuses pageaux", new[]
|
||
|
{
|
||
|
"abbeau",
|
||
|
"abdiqu",
|
||
|
"abdiqu",
|
||
|
"abond",
|
||
|
"marieux",
|
||
|
"pageau"
|
||
|
});
|
||
|
}
|
||
|
|
||
|
[Test]
|
||
|
public void TestReusableTokenStream()
|
||
|
{
|
||
|
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
|
||
|
// stopwords
|
||
|
AssertAnalyzesToReuse(
|
||
|
fa,
|
||
|
"le la chien les aux chat du des à cheval",
|
||
|
new String[] {"chien", "chat", "cheval"});
|
||
|
|
||
|
// some nouns and adjectives
|
||
|
AssertAnalyzesToReuse(
|
||
|
fa,
|
||
|
"lances chismes habitable chiste éléments captifs",
|
||
|
new String[]
|
||
|
{
|
||
|
"lanc",
|
||
|
"chism",
|
||
|
"habit",
|
||
|
"chist",
|
||
|
"élément",
|
||
|
"captif"
|
||
|
});
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Test that changes to the exclusion table are applied immediately
|
||
|
* when using reusable token streams.
|
||
|
*/
|
||
|
[Test]
|
||
|
public void TestExclusionTableReuse()
|
||
|
{
|
||
|
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
|
||
|
AssertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
|
||
|
fa.SetStemExclusionTable(new String[] { "habitable" });
|
||
|
AssertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
|
||
|
}
|
||
|
}
|
||
|
}
|