Merge tag 'upstream/4.0.5.1' into gorillapony
Upstream version 4.0.5.1 Former-commit-id: c6122b9540554e2f0aa7a2ada59709f00398fd9d
This commit is contained in:
commit
b38e6c88f4
@ -85,7 +85,7 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
|
|||||||
$(srcdir)/config.h.in mkinstalldirs \
|
$(srcdir)/config.h.in mkinstalldirs \
|
||||||
$(srcdir)/mono-core.spec.in $(srcdir)/mono-uninstalled.pc.in \
|
$(srcdir)/mono-core.spec.in $(srcdir)/mono-uninstalled.pc.in \
|
||||||
AUTHORS COPYING.LIB ChangeLog NEWS compile config.guess \
|
AUTHORS COPYING.LIB ChangeLog NEWS compile config.guess \
|
||||||
config.rpath config.sub install-sh missing ltmain.sh
|
config.rpath config.sub depcomp install-sh missing ltmain.sh
|
||||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||||
am__aclocal_m4_deps = $(top_srcdir)/m4/iconv.m4 \
|
am__aclocal_m4_deps = $(top_srcdir)/m4/iconv.m4 \
|
||||||
$(top_srcdir)/m4/lib-ld.m4 $(top_srcdir)/m4/lib-link.m4 \
|
$(top_srcdir)/m4/lib-ld.m4 $(top_srcdir)/m4/lib-link.m4 \
|
||||||
|
@ -1 +1 @@
|
|||||||
0153a4d763e8fdec1bfc6ad8a10db68ae9a8b3ad
|
94ef65996a186293ae651e688580672f8de4b880
|
@ -1 +1 @@
|
|||||||
4d5ca3baf19b55971eba3a1e4c430304031594ed
|
6e1f9bd8a55b889798c88191650b5752d2b58e3b
|
6
external/Lucene.Net.Light/README.md
vendored
Normal file
6
external/Lucene.Net.Light/README.md
vendored
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
This is a subset of Lucene.Net as used by Mono's Monodoc tool.
|
||||||
|
|
||||||
|
This module is a checkout of:
|
||||||
|
|
||||||
|
git://github.com/mono/lucene.net.git
|
||||||
|
88fb67b07621dfed054d8d75fd50672fb26349df
|
1
external/Lucene.Net.Light/src/core/Analysis/ASCIIFoldingFilter.cs.REMOVED.git-id
vendored
Normal file
1
external/Lucene.Net.Light/src/core/Analysis/ASCIIFoldingFilter.cs.REMOVED.git-id
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
61338700516ffa26e2a36fef4a0843a5fbf01c62
|
171
external/Lucene.Net.Light/src/core/Analysis/Analyzer.cs
vendored
Normal file
171
external/Lucene.Net.Light/src/core/Analysis/Analyzer.cs
vendored
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using System;
|
||||||
|
using Lucene.Net.Documents;
|
||||||
|
using Lucene.Net.Store;
|
||||||
|
using Lucene.Net.Util;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
/// <summary>An Analyzer builds TokenStreams, which analyze text. It thus represents a
/// policy for extracting index terms from text.
/// <p/>
/// Typical implementations first build a Tokenizer, which breaks the stream of
/// characters from the Reader into raw Tokens. One or more TokenFilters may
/// then be applied to the output of the Tokenizer.
/// </summary>
public abstract class Analyzer : IDisposable
{
    /// <summary>Creates a TokenStream which tokenizes all the text in the provided
    /// Reader. Must be able to handle null field name for
    /// backward compatibility.
    /// </summary>
    public abstract TokenStream TokenStream(String fieldName, System.IO.TextReader reader);

    /// <summary>Creates a TokenStream that is allowed to be re-used
    /// from the previous time that the same thread called
    /// this method. Callers that do not need to use more
    /// than one TokenStream at the same time from this
    /// analyzer should use this method for better
    /// performance. The default implementation simply delegates to
    /// <see cref="TokenStream(String,System.IO.TextReader)" />.
    /// </summary>
    public virtual TokenStream ReusableTokenStream(String fieldName, System.IO.TextReader reader)
    {
        return TokenStream(fieldName, reader);
    }

    // Per-thread storage for the saved token stream; set to null once the analyzer is disposed.
    private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
    private bool isDisposed;

    /// <summary>Used by Analyzers that implement reusableTokenStream
    /// to retrieve previously saved TokenStreams for re-use
    /// by the same thread.
    /// </summary>
    /// <exception cref="AlreadyClosedException">The analyzer has already been disposed.</exception>
    protected internal virtual object PreviousTokenStream
    {
        get
        {
            if (tokenStreams == null)
            {
                throw new AlreadyClosedException("this Analyzer is closed");
            }
            return tokenStreams.Get();
        }
        set
        {
            if (tokenStreams == null)
            {
                throw new AlreadyClosedException("this Analyzer is closed");
            }
            tokenStreams.Set(value);
        }
    }

    [Obsolete()]
    protected internal bool overridesTokenStreamMethod = false;

    /// <summary>
    /// Records in <c>overridesTokenStreamMethod</c> whether the runtime type of this
    /// analyzer declares its own <c>TokenStream(String, TextReader)</c> method instead of
    /// using the one declared by <typeparamref name="TClass"/>.
    /// </summary>
    /// <remarks>
    /// This is only present to preserve back-compat of classes that subclass a core
    /// analyzer and override tokenStream but not reusableTokenStream.
    /// <p/>
    /// Java uses Class&lt;? extends Analyzer&gt; to constrain <typeparamref name="TClass"/> to
    /// only Types that inherit from Analyzer. C# does not have a generic type class,
    /// ie Type&lt;T&gt;. The method signature stays the same, and an exception may
    /// still be thrown, if the method doesn't exist.
    /// </remarks>
    [Obsolete("This is only present to preserve back-compat of classes that subclass a core analyzer and override tokenStream but not reusableTokenStream ")]
    protected internal virtual void SetOverridesTokenStreamMethod<TClass>()
        where TClass : Analyzer
    {
        try
        {
            System.Reflection.MethodInfo m = this.GetType().GetMethod("TokenStream", new[] { typeof(string), typeof(System.IO.TextReader) });

            // GetMethod returns null (it does not throw) when no matching public method
            // is found; treat that the same as "not overridden" instead of dereferencing null.
            overridesTokenStreamMethod = m != null && m.DeclaringType != typeof(TClass);
        }
        catch (MethodAccessException)
        {
            // can't happen, as baseClass is subclass of Analyzer
            overridesTokenStreamMethod = false;
        }
    }

    /// <summary> Invoked before indexing a Fieldable instance if
    /// terms have already been added to that field. This allows custom
    /// analyzers to place an automatic position increment gap between
    /// Fieldable instances using the same field name. The default
    /// position increment gap is 0. With a 0 position increment gap and
    /// the typical default token position increment of 1, all terms in a field,
    /// including across Fieldable instances, are in successive positions, allowing
    /// exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
    /// </summary>
    /// <param name="fieldName">Fieldable name being indexed.
    /// </param>
    /// <returns> position increment gap, added to the next token emitted from <see cref="TokenStream(String,System.IO.TextReader)" />
    /// </returns>
    public virtual int GetPositionIncrementGap(String fieldName)
    {
        return 0;
    }

    /// <summary> Just like <see cref="GetPositionIncrementGap" />, except for
    /// Token offsets instead. By default this returns 1 for
    /// tokenized fields, as if the fields were joined
    /// with an extra space character, and 0 for un-tokenized
    /// fields. This method is only called if the field
    /// produced at least one token for indexing.
    /// </summary>
    /// <param name="field">the field just indexed
    /// </param>
    /// <returns> offset gap, added to the next token emitted from <see cref="TokenStream(String,System.IO.TextReader)" />
    /// </returns>
    public virtual int GetOffsetGap(IFieldable field)
    {
        return field.IsTokenized ? 1 : 0;
    }

    /// <summary>Frees persistent resources used by this Analyzer; equivalent to <see cref="Dispose()" />.</summary>
    public void Close()
    {
        Dispose();
    }

    /// <summary>Releases the per-thread token stream storage held by this Analyzer.</summary>
    public virtual void Dispose()
    {
        Dispose(true);
        // Standard dispose pattern: a derived type that adds a finalizer does not
        // need it to run once the instance has been disposed explicitly.
        GC.SuppressFinalize(this);
    }

    /// <summary>Performs the actual cleanup; safe to call more than once.</summary>
    /// <param name="disposing">true when called from <see cref="Dispose()" />; only then
    /// is the token stream storage closed and cleared.</param>
    protected virtual void Dispose(bool disposing)
    {
        if (isDisposed) return;

        if (disposing)
        {
            if (tokenStreams != null)
            {
                tokenStreams.Close();
                tokenStreams = null;
            }
        }
        isDisposed = true;
    }
}
|
||||||
|
}
|
105
external/Lucene.Net.Light/src/core/Analysis/BaseCharFilter.cs
vendored
Normal file
105
external/Lucene.Net.Light/src/core/Analysis/BaseCharFilter.cs
vendored
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using System;
|
||||||
|
using Lucene.Net.Support;
|
||||||
|
using Lucene.Net.Util;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary>
/// Base utility class for implementing a <see cref="CharFilter" />.
/// A subclass records offset mappings by calling <see cref="AddOffCorrectMap" />;
/// <see cref="Correct" /> then looks those mappings up to correct an offset.
/// </summary>
public abstract class BaseCharFilter : CharFilter
{
    // Parallel arrays: offsets[i] is an offset handed to AddOffCorrectMap and
    // diffs[i] is the cumulative diff recorded with it. Only the first `size`
    // entries are in use.
    private int[] offsets;
    private int[] diffs;
    private int size = 0;

    protected BaseCharFilter(CharStream @in) : base(@in)
    {
    }

    /// <summary>Retrieves the corrected offset.</summary>
    /// <param name="currentOff">offset to correct</param>
    /// <returns><paramref name="currentOff" /> plus the cumulative diff of the closest
    /// recorded offset that is not greater than it, or <paramref name="currentOff" />
    /// unchanged when no such mapping exists.</returns>
    protected internal override int Correct(int currentOff)
    {
        // Nothing recorded yet, or the offset lies before the first mapping.
        if (offsets == null || currentOff < offsets[0])
        {
            return currentOff;
        }

        // At or beyond the last mapping: the last cumulative diff applies.
        int last = size - 1;
        if (currentOff >= offsets[last])
        {
            return currentOff + diffs[last];
        }

        // Binary search over the recorded offsets (the lookup relies on them
        // having been added in ascending order).
        int low = 0;
        int high = last;
        int probe = -1;
        while (low <= high)
        {
            probe = Number.URShift(low + high, 1);
            int candidate = offsets[probe];

            if (currentOff == candidate)
            {
                return currentOff + diffs[probe];
            }

            if (currentOff < candidate)
            {
                high = probe - 1;
            }
            else
            {
                low = probe + 1;
            }
        }

        // No exact hit: use the mapping at the probe if it is not past the
        // offset, otherwise the one just before it (if any).
        if (currentOff >= offsets[probe])
        {
            return currentOff + diffs[probe];
        }

        return probe == 0 ? currentOff : currentOff + diffs[probe - 1];
    }

    /// <summary>The most recently recorded cumulative diff, or 0 when nothing has been recorded.</summary>
    protected int LastCumulativeDiff
    {
        get
        {
            if (offsets == null)
            {
                return 0;
            }

            return diffs[size - 1];
        }
    }

    [Obsolete("Use LastCumulativeDiff property instead")]
    protected int GetLastCumulativeDiff()
    {
        return LastCumulativeDiff;
    }

    /// <summary>Appends a mapping from an offset to its cumulative diff.</summary>
    /// <param name="off">offset at which the diff starts to apply</param>
    /// <param name="cumulativeDiff">value added to offsets at or after <paramref name="off" /></param>
    protected void AddOffCorrectMap(int off, int cumulativeDiff)
    {
        if (offsets == null)
        {
            // First mapping: allocate both arrays with the initial capacity.
            offsets = new int[64];
            diffs = new int[64];
        }
        else if (size == offsets.Length)
        {
            // Arrays are full: grow both in step.
            offsets = ArrayUtil.Grow(offsets);
            diffs = ArrayUtil.Grow(diffs);
        }

        offsets[size] = off;
        diffs[size] = cumulativeDiff;
        size++;
    }
}
|
||||||
|
}
|
86
external/Lucene.Net.Light/src/core/Analysis/CachingTokenFilter.cs
vendored
Normal file
86
external/Lucene.Net.Light/src/core/Analysis/CachingTokenFilter.cs
vendored
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> Lets the token attributes of a TokenStream be consumed more than
/// once by caching every token attribute state locally in a list.
///
/// <p/>CachingTokenFilter implements the optional method
/// <see cref="TokenStream.Reset()" />, which repositions the
/// stream to the first Token.
/// </summary>
public sealed class CachingTokenFilter : TokenFilter
{
    // Captured states of all tokens; null until the first IncrementToken call.
    private System.Collections.Generic.LinkedList<State> cache = null;
    // Cursor over the cache; replaced with a fresh enumerator by Reset.
    private System.Collections.Generic.IEnumerator<State> iterator = null;
    // State captured after the wrapped stream's End() call.
    private State finalState;

    public CachingTokenFilter(TokenStream input)
        : base(input)
    {
    }

    /// <summary>Advances to the next cached token, filling the cache from the
    /// wrapped stream on the first call.</summary>
    /// <returns>true if a token state was restored; false once the cache is exhausted.</returns>
    public override bool IncrementToken()
    {
        if (cache == null)
        {
            // Lazily drain the wrapped stream into the cache.
            cache = new System.Collections.Generic.LinkedList<State>();
            FillCache();
            iterator = cache.GetEnumerator();
        }

        if (iterator.MoveNext())
        {
            // The filter can be reset, so the cached entry is restored into
            // the attributes rather than handed out.
            RestoreState(iterator.Current);
            return true;
        }

        // The cache is exhausted.
        return false;
    }

    /// <summary>Restores the final state captured while filling the cache, if any.</summary>
    public override void End()
    {
        if (finalState == null)
        {
            return;
        }

        RestoreState(finalState);
    }

    /// <summary>Repositions the filter at the first cached token; does nothing
    /// if the cache has not been filled yet.</summary>
    public override void Reset()
    {
        if (cache == null)
        {
            return;
        }

        iterator = cache.GetEnumerator();
    }

    // Consumes the wrapped stream completely, storing one captured state per
    // token, then captures the state left behind by End().
    private void FillCache()
    {
        while (input.IncrementToken())
        {
            State snapshot = CaptureState();
            cache.AddLast(snapshot);
        }

        input.End();
        finalState = CaptureState();
    }
}
|
||||||
|
}
|
517
external/Lucene.Net.Light/src/core/Analysis/CharArraySet.cs
vendored
Normal file
517
external/Lucene.Net.Light/src/core/Analysis/CharArraySet.cs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
95
external/Lucene.Net.Light/src/core/Analysis/CharFilter.cs
vendored
Normal file
95
external/Lucene.Net.Light/src/core/Analysis/CharFilter.cs
vendored
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> Subclasses of CharFilter can be chained to filter CharStream.
/// They can be used as <see cref="System.IO.TextReader" /> with additional offset
/// correction. <see cref="Tokenizer" />s will automatically use <see cref="CorrectOffset" />
/// if a CharFilter/CharStream subclass is used.
/// </summary>
public abstract class CharFilter : CharStream
{
    // Underlying stream position recorded by Mark; -1 until Mark has been called.
    private long currentPosition = -1;
    private bool isDisposed;
    // The wrapped stream that reads and offset corrections are chained through.
    protected internal CharStream input;

    protected internal CharFilter(CharStream in_Renamed) : base(in_Renamed)
    {
        input = in_Renamed;
    }

    /// <summary>Subclass may want to override to correct the current offset.</summary>
    /// <param name="currentOff">current offset</param>
    /// <returns>corrected offset</returns>
    protected internal virtual int Correct(int currentOff)
    {
        return currentOff;
    }

    /// <summary> Chains the corrected offset through the input
    /// CharFilter.
    /// </summary>
    public override int CorrectOffset(int currentOff)
    {
        return input.CorrectOffset(Correct(currentOff));
    }

    /// <summary>Closes the wrapped stream (when disposing) and releases the reference to it.</summary>
    protected override void Dispose(bool disposing)
    {
        if (isDisposed) return;

        if (disposing)
        {
            if (input != null)
            {
                input.Close();
            }
        }

        input = null;
        isDisposed = true;
        base.Dispose(disposing);
    }

    /// <summary>Reads from the wrapped stream.</summary>
    public override int Read(System.Char[] cbuf, int off, int len)
    {
        return input.Read(cbuf, off, len);
    }

    /// <summary>Reports whether <see cref="Mark" /> and <see cref="Reset" /> can work,
    /// i.e. whether the wrapped stream's underlying stream is seekable.</summary>
    public bool MarkSupported()
    {
        return input.BaseStream.CanSeek;
    }

    /// <summary>Remembers the current position of the underlying stream so that
    /// <see cref="Reset" /> can return to it.</summary>
    /// <param name="readAheadLimit">Kept for API compatibility; not used. The previous
    /// implementation assigned it to the underlying stream's position, which moved
    /// the stream instead of merely marking it.</param>
    public void Mark(int readAheadLimit)
    {
        // NOTE(review): this is the position of the underlying stream, which can be
        // ahead of the characters actually consumed because StreamReader reads in
        // buffered chunks - confirm whether callers need character-exact marks.
        currentPosition = input.BaseStream.Position;
    }

    /// <summary>Returns the underlying stream to the position recorded by <see cref="Mark" />.</summary>
    public void Reset()
    {
        input.BaseStream.Position = currentPosition;
        // The reader still holds characters buffered from the old position; drop
        // them so the next read starts at the restored position.
        input.DiscardBufferedData();
    }
}
|
||||||
|
}
|
94
external/Lucene.Net.Light/src/core/Analysis/CharReader.cs
vendored
Normal file
94
external/Lucene.Net.Light/src/core/Analysis/CharReader.cs
vendored
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> CharReader is a Reader wrapper. It reads chars from
/// Reader and outputs <see cref="CharStream" />, defining an
/// identity function <see cref="CorrectOffset" /> method that
/// simply returns the provided offset.
/// </summary>
public sealed class CharReader : CharStream
{
    // Underlying stream position recorded by Mark; -1 until Mark has been called.
    private long currentPosition = -1;

    private bool isDisposed;

    // The reader all Read calls are delegated to.
    internal System.IO.StreamReader input;

    /// <summary>Returns <paramref name="input" /> itself when it already is a
    /// <see cref="CharStream" />; otherwise reads it to the end and wraps the
    /// text in a new CharReader.</summary>
    /// <param name="input">reader to expose as a CharStream</param>
    public static CharStream Get(System.IO.TextReader input)
    {
        var charStream = input as CharStream;
        if (charStream != null)
            return charStream;

        // {{Aroush-2.9}} isn't there a better (faster) way to do this?
        // The constructor needs a StreamReader, so the remaining text is copied
        // into an in-memory UTF-8 stream and read back from there.
        var buffered = new System.IO.MemoryStream(System.Text.Encoding.UTF8.GetBytes(input.ReadToEnd()));
        return new CharReader(new System.IO.StreamReader(buffered));
    }

    private CharReader(System.IO.StreamReader in_Renamed) : base(in_Renamed)
    {
        input = in_Renamed;
    }

    /// <summary>Identity correction: returns <paramref name="currentOff" /> unchanged.</summary>
    public override int CorrectOffset(int currentOff)
    {
        return currentOff;
    }

    /// <summary>Closes the wrapped reader (when disposing) and releases the reference to it.</summary>
    protected override void Dispose(bool disposing)
    {
        if (isDisposed) return;

        if (disposing)
        {
            if (input != null)
            {
                input.Close();
            }
        }

        input = null;
        isDisposed = true;
        base.Dispose(disposing);
    }

    /// <summary>Reads from the wrapped reader.</summary>
    public override int Read(System.Char[] cbuf, int off, int len)
    {
        return input.Read(cbuf, off, len);
    }

    /// <summary>Reports whether <see cref="Mark" /> and <see cref="Reset" /> can work,
    /// i.e. whether the wrapped reader's underlying stream is seekable.</summary>
    public bool MarkSupported()
    {
        return input.BaseStream.CanSeek;
    }

    /// <summary>Remembers the current position of the underlying stream so that
    /// <see cref="Reset" /> can return to it.</summary>
    /// <param name="readAheadLimit">Kept for API compatibility; not used. The previous
    /// implementation assigned it to the underlying stream's position, which moved
    /// the stream instead of merely marking it.</param>
    public void Mark(int readAheadLimit)
    {
        // NOTE(review): this is the position of the underlying stream, which can be
        // ahead of the characters actually consumed because StreamReader reads in
        // buffered chunks - confirm whether callers need character-exact marks.
        currentPosition = input.BaseStream.Position;
    }

    /// <summary>Returns the underlying stream to the position recorded by <see cref="Mark" />.</summary>
    public void Reset()
    {
        input.BaseStream.Position = currentPosition;
        // The reader still holds characters buffered from the old position; drop
        // them so the next read starts at the restored position.
        input.DiscardBufferedData();
    }
}
|
||||||
|
}
|
45
external/Lucene.Net.Light/src/core/Analysis/CharStream.cs
vendored
Normal file
45
external/Lucene.Net.Light/src/core/Analysis/CharStream.cs
vendored
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> CharStream extends <see cref="System.IO.TextReader" /> with
/// <see cref="CorrectOffset" />. All Tokenizers accept a CharStream instead of a
/// <see cref="System.IO.TextReader" /> as input, which enables arbitrary
/// character based filtering before tokenization.
/// The <see cref="CorrectOffset" /> method fixes offsets to account for
/// removal or insertion of characters, so that the offsets
/// reported in the tokens match the character offsets of the
/// original Reader.
/// </summary>
public abstract class CharStream : System.IO.StreamReader
{
    /// <summary>Creates a CharStream over the underlying stream of <paramref name="reader" />.</summary>
    /// <param name="reader">reader whose <c>BaseStream</c> this instance is opened on</param>
    protected CharStream(System.IO.StreamReader reader)
        : base(reader.BaseStream)
    {
    }

    /// <summary> Called by CharFilter(s) and Tokenizer to correct token offset.
    /// </summary>
    /// <param name="currentOff">offset as seen in the output</param>
    /// <returns>corrected offset based on the input</returns>
    public abstract int CorrectOffset(int currentOff);
}
|
||||||
|
}
|
135
external/Lucene.Net.Light/src/core/Analysis/CharTokenizer.cs
vendored
Normal file
135
external/Lucene.Net.Light/src/core/Analysis/CharTokenizer.cs
vendored
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using Lucene.Net.Analysis.Tokenattributes;
|
||||||
|
using AttributeSource = Lucene.Net.Util.AttributeSource;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary>An abstract base class for simple, character-oriented tokenizers.</summary>
public abstract class CharTokenizer : Tokenizer
{
    private const int MAX_WORD_LEN = 255;
    private const int IO_BUFFER_SIZE = 4096;

    // Characters most recently read from the input.
    private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;

    // offset: number of characters consumed by earlier buffer fills.
    private int offset = 0;
    // bufferIndex: index of the next unread character in ioBuffer.
    private int bufferIndex = 0;
    // dataLen: number of valid characters currently in ioBuffer.
    private int dataLen = 0;

    protected CharTokenizer(System.IO.TextReader input)
        : base(input)
    {
        offsetAtt = AddAttribute<IOffsetAttribute>();
        termAtt = AddAttribute<ITermAttribute>();
    }

    protected CharTokenizer(AttributeSource source, System.IO.TextReader input)
        : base(source, input)
    {
        offsetAtt = AddAttribute<IOffsetAttribute>();
        termAtt = AddAttribute<ITermAttribute>();
    }

    protected CharTokenizer(AttributeFactory factory, System.IO.TextReader input)
        : base(factory, input)
    {
        offsetAtt = AddAttribute<IOffsetAttribute>();
        termAtt = AddAttribute<ITermAttribute>();
    }

    /// <summary>Returns true iff a character should be included in a token. This
    /// tokenizer generates as tokens adjacent sequences of characters which
    /// satisfy this predicate. Characters for which this is false are used to
    /// define token boundaries and are not included in tokens.
    /// </summary>
    protected internal abstract bool IsTokenChar(char c);

    /// <summary>Called on each token character to normalize it before it is added to the
    /// token. The default implementation does nothing. Subclasses may use this
    /// to, e.g., lowercase tokens.
    /// </summary>
    protected internal virtual char Normalize(char c)
    {
        return c;
    }

    /// <summary>Collects the next run of token characters into the term attribute
    /// and sets its corrected start and end offsets.</summary>
    /// <returns>true if a token was produced; false when the input is exhausted
    /// without any token character having been collected.</returns>
    public override bool IncrementToken()
    {
        ClearAttributes();

        char[] termChars = termAtt.TermBuffer();
        int tokenStart = bufferIndex;
        int tokenLength = 0;

        for (;;)
        {
            if (bufferIndex >= dataLen)
            {
                // Buffer used up: account for it and read the next chunk.
                offset += dataLen;
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                if (dataLen <= 0)
                {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if (tokenLength == 0)
                    {
                        return false;
                    }

                    // End of input with a pending token: emit it.
                    break;
                }

                bufferIndex = 0;
            }

            char current = ioBuffer[bufferIndex++];

            if (!IsTokenChar(current))
            {
                if (tokenLength > 0)
                {
                    // A non-token character ends the token collected so far.
                    break;
                }

                // Still skipping separators in front of the next token.
                continue;
            }

            if (tokenLength == 0)
            {
                // First character of a token: remember where it starts.
                tokenStart = offset + bufferIndex - 1;
            }
            else if (tokenLength == termChars.Length)
            {
                termChars = termAtt.ResizeTermBuffer(tokenLength + 1);
            }

            termChars[tokenLength] = Normalize(current);
            tokenLength++;

            if (tokenLength == MAX_WORD_LEN)
            {
                // Maximum token length reached: emit what has been collected.
                break;
            }
        }

        termAtt.SetTermLength(tokenLength);
        offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenStart + tokenLength));
        return true;
    }

    /// <summary>Sets both offsets of the offset attribute to the corrected final offset.</summary>
    public override void End()
    {
        int finalOffset = CorrectOffset(offset);
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>Switches to a new input and clears the buffering state.</summary>
    public override void Reset(System.IO.TextReader input)
    {
        base.Reset(input);
        offset = 0;
        dataLen = 0;
        bufferIndex = 0;
    }
}
|
||||||
|
}
|
344
external/Lucene.Net.Light/src/core/Analysis/ISOLatin1AccentFilter.cs
vendored
Normal file
344
external/Lucene.Net.Light/src/core/Analysis/ISOLatin1AccentFilter.cs
vendored
Normal file
@ -0,0 +1,344 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using System;
|
||||||
|
using Lucene.Net.Analysis.Tokenattributes;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> A filter that replaces accented characters in the ISO Latin 1 character set
|
||||||
|
/// (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
|
||||||
|
/// <p/>
|
||||||
|
/// For instance, 'À' will be replaced by 'A'.
|
||||||
|
/// <p/>
|
||||||
|
///
|
||||||
|
/// </summary>
|
||||||
|
/// <deprecated> If you build a new index, use <see cref="ASCIIFoldingFilter"/>
|
||||||
|
/// which covers a superset of Latin 1.
|
||||||
|
/// This class is included for use with existing indexes and will be removed
|
||||||
|
/// in a future release (possible Lucene 4.0)
|
||||||
|
/// </deprecated>
|
||||||
|
[Obsolete("If you build a new index, use ASCIIFoldingFilter which covers a superset of Latin 1. This class is included for use with existing indexes and will be removed in a future release (possible Lucene 4.0).")]
|
||||||
|
public class ISOLatin1AccentFilter : TokenFilter
{
    // Scratch buffer that receives the folded term written by RemoveAccents.
    private char[] output = new char[256];

    // Number of valid chars in 'output' after the latest RemoveAccents call.
    private int outputPos;

    private readonly ITermAttribute termAtt;

    public ISOLatin1AccentFilter(TokenStream input) : base(input)
    {
        termAtt = AddAttribute<ITermAttribute>();
    }

    /// <summary> Advances the wrapped stream by one token and, if the term contains any
    /// char in the range U+00C0..U+FB06, replaces the term with its folded form.
    /// </summary>
    /// <returns> true if a token is available, false once the wrapped stream is exhausted.</returns>
    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
            return false;

        char[] term = termAtt.TermBuffer();
        int termLength = termAtt.TermLength();

        // Tokens without any candidate char are passed through untouched.
        for (int idx = 0; idx < termLength; idx++)
        {
            char ch = term[idx];
            if (ch < '\u00c0' || ch > '\uFB06')
                continue;

            // One candidate is enough: fold the whole term once and stop scanning.
            RemoveAccents(term, termLength);
            termAtt.SetTermBuffer(output, 0, outputPos);
            break;
        }

        return true;
    }

    /// <summary> Copies the first <c>length</c> chars of <c>input</c> into the internal
    /// output buffer, replacing each mapped accented char or ligature by its
    /// unaccented one- or two-char equivalent. The number of chars written is
    /// left in <c>outputPos</c>.
    /// </summary>
    /// <param name="input"> Source chars to fold.</param>
    /// <param name="length"> Number of chars of <c>input</c> to process.</param>
    public void RemoveAccents(char[] input, int length)
    {
        // A single source char expands to at most two output chars.
        int required = 2 * length;

        int capacity = output.Length;
        while (capacity < required)
            capacity *= 2;

        if (capacity != output.Length)
            output = new char[capacity];

        outputPos = 0;

        for (int pos = 0; pos < length; pos++)
        {
            char ch = input[pos];

            // Fast path: chars outside the candidate range are copied verbatim.
            if (ch < '\u00c0' || ch > '\uFB06')
            {
                Put(ch);
                continue;
            }

            switch (ch)
            {
                // U+00C0..U+00C5: capital A with grave, acute, circumflex, tilde, diaeresis, ring
                case '\u00C0': case '\u00C1': case '\u00C2':
                case '\u00C3': case '\u00C4': case '\u00C5':
                    Put('A');
                    break;

                case '\u00C6': // capital ligature AE
                    Put('A', 'E');
                    break;

                case '\u00C7': // capital C with cedilla
                    Put('C');
                    break;

                // U+00C8..U+00CB: capital E with grave, acute, circumflex, diaeresis
                case '\u00C8': case '\u00C9': case '\u00CA': case '\u00CB':
                    Put('E');
                    break;

                // U+00CC..U+00CF: capital I with grave, acute, circumflex, diaeresis
                case '\u00CC': case '\u00CD': case '\u00CE': case '\u00CF':
                    Put('I');
                    break;

                case '\u0132': // capital ligature IJ
                    Put('I', 'J');
                    break;

                case '\u00D0': // capital Eth
                    Put('D');
                    break;

                case '\u00D1': // capital N with tilde
                    Put('N');
                    break;

                // U+00D2..U+00D6 and U+00D8: capital O with grave, acute, circumflex, tilde, diaeresis, stroke
                case '\u00D2': case '\u00D3': case '\u00D4':
                case '\u00D5': case '\u00D6': case '\u00D8':
                    Put('O');
                    break;

                case '\u0152': // capital ligature OE
                    Put('O', 'E');
                    break;

                case '\u00DE': // capital Thorn
                    Put('T', 'H');
                    break;

                // U+00D9..U+00DC: capital U with grave, acute, circumflex, diaeresis
                case '\u00D9': case '\u00DA': case '\u00DB': case '\u00DC':
                    Put('U');
                    break;

                // capital Y with acute, capital Y with diaeresis
                case '\u00DD': case '\u0178':
                    Put('Y');
                    break;

                // U+00E0..U+00E5: small a with grave, acute, circumflex, tilde, diaeresis, ring
                case '\u00E0': case '\u00E1': case '\u00E2':
                case '\u00E3': case '\u00E4': case '\u00E5':
                    Put('a');
                    break;

                case '\u00E6': // small ligature ae
                    Put('a', 'e');
                    break;

                case '\u00E7': // small c with cedilla
                    Put('c');
                    break;

                // U+00E8..U+00EB: small e with grave, acute, circumflex, diaeresis
                case '\u00E8': case '\u00E9': case '\u00EA': case '\u00EB':
                    Put('e');
                    break;

                // U+00EC..U+00EF: small i with grave, acute, circumflex, diaeresis
                case '\u00EC': case '\u00ED': case '\u00EE': case '\u00EF':
                    Put('i');
                    break;

                case '\u0133': // small ligature ij
                    Put('i', 'j');
                    break;

                case '\u00F0': // small eth
                    Put('d');
                    break;

                case '\u00F1': // small n with tilde
                    Put('n');
                    break;

                // U+00F2..U+00F6 and U+00F8: small o with grave, acute, circumflex, tilde, diaeresis, stroke
                case '\u00F2': case '\u00F3': case '\u00F4':
                case '\u00F5': case '\u00F6': case '\u00F8':
                    Put('o');
                    break;

                case '\u0153': // small ligature oe
                    Put('o', 'e');
                    break;

                case '\u00DF': // small sharp s
                    Put('s', 's');
                    break;

                case '\u00FE': // small thorn
                    Put('t', 'h');
                    break;

                // U+00F9..U+00FC: small u with grave, acute, circumflex, diaeresis
                case '\u00F9': case '\u00FA': case '\u00FB': case '\u00FC':
                    Put('u');
                    break;

                // small y with acute, small y with diaeresis
                case '\u00FD': case '\u00FF':
                    Put('y');
                    break;

                case '\uFB00': // ligature ff
                    Put('f', 'f');
                    break;

                case '\uFB01': // ligature fi
                    Put('f', 'i');
                    break;

                case '\uFB02': // ligature fl
                    Put('f', 'l');
                    break;

                // U+FB03 (ffi) and U+FB04 (ffl) are intentionally not folded: each needs
                // three output chars, which would break the 2 * length bound computed
                // above (and sizing for 3 * length could be expensive).

                case '\uFB05': // ligature long-s t
                    Put('f', 't');
                    break;

                case '\uFB06': // ligature st
                    Put('s', 't');
                    break;

                default:
                    // In range but without a mapping: keep the char as it is.
                    Put(ch);
                    break;
            }
        }
    }

    // Appends one char to the output buffer.
    private void Put(char value)
    {
        output[outputPos++] = value;
    }

    // Appends a two-char replacement to the output buffer.
    private void Put(char first, char second)
    {
        output[outputPos++] = first;
        output[outputPos++] = second;
    }
}
|
||||||
|
}
|
54
external/Lucene.Net.Light/src/core/Analysis/KeywordAnalyzer.cs
vendored
Normal file
54
external/Lucene.Net.Light/src/core/Analysis/KeywordAnalyzer.cs
vendored
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> Analyzer whose token streams are <see cref="KeywordTokenizer"/> instances,
/// i.e. the entire stream is "tokenized" as a single token. This is useful for
/// data like zip codes, ids, and some product names.
/// </summary>
public class KeywordAnalyzer : Analyzer
{
    public KeywordAnalyzer()
    {
        SetOverridesTokenStreamMethod<KeywordAnalyzer>();
    }

    /// <summary> Creates a fresh <see cref="KeywordTokenizer"/> over <c>reader</c>.</summary>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        return new KeywordTokenizer(reader);
    }

    /// <summary> Returns the previously stored tokenizer re-targeted at <c>reader</c>,
    /// creating and storing one on first use.
    /// </summary>
    public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        // LUCENE-1678: if a subclass overrides TokenStream but not
        // ReusableTokenStream, we must fall back to TokenStream().
        if (overridesTokenStreamMethod)
            return TokenStream(fieldName, reader);

        var cached = (Tokenizer) PreviousTokenStream;
        if (cached != null)
        {
            cached.Reset(reader);
            return cached;
        }

        cached = new KeywordTokenizer(reader);
        PreviousTokenStream = cached;
        return cached;
    }
}
|
||||||
|
}
|
99
external/Lucene.Net.Light/src/core/Analysis/KeywordTokenizer.cs
vendored
Normal file
99
external/Lucene.Net.Light/src/core/Analysis/KeywordTokenizer.cs
vendored
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using Lucene.Net.Analysis.Tokenattributes;
|
||||||
|
using AttributeSource = Lucene.Net.Util.AttributeSource;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> Emits the entire input as a single token.</summary>
public sealed class KeywordTokenizer : Tokenizer
{
    private const int DEFAULT_BUFFER_SIZE = 256;

    // True once the single token has been produced for the current reader.
    private bool done;

    // Corrected end offset of the emitted token; reported again by End().
    private int finalOffset;

    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;

    /// <summary> Creates a tokenizer over <c>input</c> with the default initial buffer size.</summary>
    public KeywordTokenizer(System.IO.TextReader input) : this(input, DEFAULT_BUFFER_SIZE)
    {
    }

    /// <summary> Creates a tokenizer over <c>input</c>, asking the term attribute for
    /// a buffer of <c>bufferSize</c> chars up front.
    /// </summary>
    public KeywordTokenizer(System.IO.TextReader input, int bufferSize) : base(input)
    {
        Init(bufferSize);
    }

    /// <summary> Same as the (input, bufferSize) constructor, passing <c>source</c> to the base tokenizer.</summary>
    public KeywordTokenizer(AttributeSource source, System.IO.TextReader input, int bufferSize) : base(source, input)
    {
        Init(bufferSize);
    }

    /// <summary> Same as the (input, bufferSize) constructor, passing <c>factory</c> to the base tokenizer.</summary>
    public KeywordTokenizer(AttributeFactory factory, System.IO.TextReader input, int bufferSize) : base(factory, input)
    {
        Init(bufferSize);
    }

    // Shared constructor tail: registers the attributes and pre-sizes the term buffer.
    private void Init(int bufferSize)
    {
        this.done = false;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        termAtt.ResizeTermBuffer(bufferSize);
    }

    /// <summary> On the first call after construction or Reset, reads the whole input
    /// into the term buffer and emits it as one token.
    /// </summary>
    /// <returns> true for that first call, false for every later call.</returns>
    public override bool IncrementToken()
    {
        if (done)
            return false;

        ClearAttributes();
        done = true;

        int upto = 0;
        char[] buffer = termAtt.TermBuffer();
        while (true)
        {
            int length = input.Read(buffer, upto, buffer.Length - upto);

            // End of input. A non-positive count is treated as the end, the same
            // way CharTokenizer does, so that a reader returning a negative value
            // cannot move 'upto' backwards.
            if (length <= 0)
                break;

            upto += length;

            // Buffer is full: ask for a bigger one before the next read.
            if (upto == buffer.Length)
                buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
        }

        termAtt.SetTermLength(upto);
        finalOffset = CorrectOffset(upto);
        offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
        return true;
    }

    /// <summary> Sets both ends of the offset attribute to the final offset.</summary>
    public override void End()
    {
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary> Re-targets the tokenizer at a new reader and re-arms token emission.</summary>
    public override void Reset(System.IO.TextReader input)
    {
        base.Reset(input);
        this.done = false;
    }
}
|
||||||
|
}
|
60
external/Lucene.Net.Light/src/core/Analysis/LengthFilter.cs
vendored
Normal file
60
external/Lucene.Net.Light/src/core/Analysis/LengthFilter.cs
vendored
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using Lucene.Net.Analysis.Tokenattributes;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary>Drops tokens whose term length lies outside the inclusive range [min, max].</summary>
public sealed class LengthFilter : TokenFilter
{
    // Inclusive bounds on the term length of tokens that are let through.
    internal int min;
    internal int max;

    private readonly ITermAttribute termAtt;

    /// <summary> Builds a filter over <c>in_Renamed</c> that removes words that are
    /// too long or too short.
    /// </summary>
    public LengthFilter(TokenStream in_Renamed, int min, int max) : base(in_Renamed)
    {
        this.min = min;
        this.max = max;
        termAtt = AddAttribute<ITermAttribute>();
    }

    /// <summary> Advances to the next input token whose term length is within range.</summary>
    /// <returns> true if such a token was found, false once the input is exhausted.</returns>
    public override bool IncrementToken()
    {
        while (input.IncrementToken())
        {
            int termLength = termAtt.TermLength();

            // Out-of-range tokens are skipped as a whole; their parts are not indexed.
            if (termLength < min || termLength > max)
                continue;

            return true;
        }

        // End of stream reached without another acceptable token.
        return false;
    }
}
|
||||||
|
}
|
57
external/Lucene.Net.Light/src/core/Analysis/LetterTokenizer.cs
vendored
Normal file
57
external/Lucene.Net.Light/src/core/Analysis/LetterTokenizer.cs
vendored
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using AttributeSource = Lucene.Net.Util.AttributeSource;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary>A tokenizer that divides text at non-letters: tokens are runs of
/// adjacent characters for which <see cref="char.IsLetter(char)" /> is true.
/// Note: this does a decent job for most European languages, but does a terrible
/// job for some Asian languages, where words are not separated by spaces.
/// </summary>
public class LetterTokenizer : CharTokenizer
{
    /// <summary>Creates a LetterTokenizer reading from <c>in</c>. </summary>
    public LetterTokenizer(System.IO.TextReader @in) : base(@in)
    {
    }

    /// <summary>Creates a LetterTokenizer on top of the given <see cref="AttributeSource" />. </summary>
    public LetterTokenizer(AttributeSource source, System.IO.TextReader @in) : base(source, @in)
    {
    }

    /// <summary>Creates a LetterTokenizer using the given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. </summary>
    public LetterTokenizer(AttributeFactory factory, System.IO.TextReader @in) : base(factory, @in)
    {
    }

    /// <summary>A character belongs to a token exactly when
    /// <see cref="char.IsLetter(char)" /> accepts it.
    /// </summary>
    protected internal override bool IsTokenChar(char c)
    {
        return char.IsLetter(c);
    }
}
|
||||||
|
}
|
49
external/Lucene.Net.Light/src/core/Analysis/LowerCaseFilter.cs
vendored
Normal file
49
external/Lucene.Net.Light/src/core/Analysis/LowerCaseFilter.cs
vendored
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using Lucene.Net.Analysis.Tokenattributes;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary>Normalizes token text to lower case, independently of the current culture.</summary>
public sealed class LowerCaseFilter : TokenFilter
{
    private readonly ITermAttribute termAtt;

    public LowerCaseFilter(TokenStream @in) : base(@in)
    {
        termAtt = AddAttribute<ITermAttribute>();
    }

    /// <summary>Advances the wrapped stream by one token and lower-cases its term in place.</summary>
    /// <returns>true if a token is available, false once the wrapped stream is exhausted.</returns>
    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
            return false;

        char[] buffer = termAtt.TermBuffer();
        int length = termAtt.TermLength();

        // Use the invariant culture: char.ToLower(char) depends on the current
        // thread culture (e.g. 'I' becomes dotless 'ı' under tr-TR), which would
        // make the produced terms differ from one machine to another.
        for (int i = 0; i < length; i++)
            buffer[i] = char.ToLowerInvariant(buffer[i]);

        return true;
    }
}
|
||||||
|
}
|
60
external/Lucene.Net.Light/src/core/Analysis/LowerCaseTokenizer.cs
vendored
Normal file
60
external/Lucene.Net.Light/src/core/Analysis/LowerCaseTokenizer.cs
vendored
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
using AttributeSource = Lucene.Net.Util.AttributeSource;
|
||||||
|
|
||||||
|
namespace Lucene.Net.Analysis
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary> LowerCaseTokenizer performs the function of LetterTokenizer
/// and LowerCaseFilter together. It divides text at non-letters and converts
/// the resulting tokens to lower case. While it is functionally equivalent to
/// the combination of LetterTokenizer and LowerCaseFilter, there is a performance
/// advantage to doing the two tasks at once, hence this (redundant) implementation.
/// <p/>
/// Note: this does a decent job for most European languages, but does a terrible
/// job for some Asian languages, where words are not separated by spaces.
/// </summary>
public sealed class LowerCaseTokenizer : LetterTokenizer
{
    /// <summary>Construct a new LowerCaseTokenizer. </summary>
    public LowerCaseTokenizer(System.IO.TextReader @in)
        : base(@in)
    {
    }

    /// <summary>Construct a new LowerCaseTokenizer using a given <see cref="AttributeSource" />. </summary>
    public LowerCaseTokenizer(AttributeSource source, System.IO.TextReader @in)
        : base(source, @in)
    {
    }

    /// <summary>Construct a new LowerCaseTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. </summary>
    public LowerCaseTokenizer(AttributeFactory factory, System.IO.TextReader @in)
        : base(factory, @in)
    {
    }

    /// <summary>Converts char to lower case using the invariant culture
    /// (<see cref="char.ToLowerInvariant(char)" />).
    /// </summary>
    protected internal override char Normalize(char c)
    {
        // char.ToLower(char) follows the current thread culture (e.g. tr-TR maps
        // 'I' to dotless 'ı'); the invariant form keeps terms identical everywhere.
        return char.ToLowerInvariant(c);
    }
}
|
||||||
|
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user