Lucene_4_3_0: Port Chinese Analyzer Components #2

Open · wants to merge 4 commits into base: lucene_4_3_0
Changes from all commits
55 changes: 9 additions & 46 deletions src/contrib/Analyzers/Cn/ChineseAnalyzer.cs
@@ -21,65 +21,28 @@
 
 using System;
 using System.IO;
-using System.Text;
-using System.Collections;
 
 using Lucene.Net.Analysis;
 
 namespace Lucene.Net.Analysis.Cn
 {
     /// <summary>
     /// An <see cref="Analyzer"/> that tokenizes text with <see cref="ChineseTokenizer"/> and
     /// filters with <see cref="ChineseFilter"/>
     /// </summary>
+    [Obsolete("(3.1) Use {Lucene.Net.Analysis.Standard.StandardAnalyzer} instead, which has the same functionality. This analyzer will be removed in Lucene 5.0")]
     public class ChineseAnalyzer : Analyzer
     {
-
-        public ChineseAnalyzer()
-        {
-        }
-
-        /// <summary>
-        /// Creates a TokenStream which tokenizes all the text in the provided Reader.
-        /// </summary>
-        /// <returns>A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.</returns>
-        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
-        {
-            TokenStream result = new ChineseTokenizer(reader);
-            result = new ChineseFilter(result);
-            return result;
-        }
-
-        private class SavedStreams
-        {
-            protected internal Tokenizer source;
-            protected internal TokenStream result;
-        };
-
         /// <summary>
-        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text in the
-        /// provided <see cref="TextReader"/>.
+        /// Creates <see cref="Analyzer.TokenStreamComponents"/>
+        /// used to tokenize all the text in the provided <see cref="TextReader"/>.
         /// </summary>
-        /// <returns>
-        /// A <see cref="TokenStream"/> built from a <see cref="ChineseTokenizer"/>
-        /// filtered with <see cref="ChineseFilter"/>.
-        /// </returns>
-        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
+        /// <returns>
+        /// <see cref="Analyzer.TokenStreamComponents"/>
+        /// built from a <see cref="ChineseTokenizer"/> filtered with
+        /// <see cref="ChineseFilter"/></returns>
+        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
         {
-            /* tokenStream() is final, no back compat issue */
-            SavedStreams streams = (SavedStreams) PreviousTokenStream;
-            if (streams == null)
-            {
-                streams = new SavedStreams();
-                streams.source = new ChineseTokenizer(reader);
-                streams.result = new ChineseFilter(streams.source);
-                PreviousTokenStream = streams;
-            }
-            else
-            {
-                streams.source.Reset(reader);
-            }
-            return streams.result;
+            Tokenizer source = new ChineseTokenizer(reader);
+            return new TokenStreamComponents(source, new ChineseFilter(source));
         }
     }
 }
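
A minimal consumption sketch of the ported analyzer, for reviewers. This assumes the lucene_4_3_0 branch keeps the Java 4.3-style consume contract (Reset → IncrementToken → End) and exposes Analyzer.TokenStream(field, reader) as the sealed entry point over the CreateComponents override above; the field name and input are illustrative, not part of this PR.

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cn;
    using Lucene.Net.Analysis.Tokenattributes;

    public static class ChineseAnalyzerDemo
    {
        public static void Main()
        {
            Analyzer analyzer = new ChineseAnalyzer();
            // TokenStream(...) is assumed to wrap the CreateComponents override above.
            TokenStream stream = analyzer.TokenStream("content", new StringReader("这是中文 test 123"));
            ICharTermAttribute termAtt = stream.AddAttribute<ICharTermAttribute>();

            stream.Reset();                         // required before the first IncrementToken in 4.x
            while (stream.IncrementToken())
            {
                // Expected terms: 这, 是, 中, 文, test — "123" is dropped by ChineseFilter.
                Console.WriteLine(new string(termAtt.Buffer, 0, termAtt.Length));
            }
            stream.End();
            stream.Dispose();
        }
    }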
39 changes: 21 additions & 18 deletions src/contrib/Analyzers/Cn/ChineseFilter.cs
@@ -24,31 +24,34 @@
 using System.IO;
 using System.Collections;
 using System.Globalization;
+using System.Linq;
 
 using Lucene.Net.Analysis;
 using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using Version = Lucene.Net.Util.Version;
 
 namespace Lucene.Net.Analysis.Cn
 {
-    // TODO: convert this XML code to valid .NET
     /// <summary>
-    /// A {@link TokenFilter} with a stop word table.
-    /// <ul>
-    /// <li>Numeric tokens are removed.</li>
-    /// <li>English tokens must be larger than 1 char.</li>
-    /// <li>One Chinese char as one Chinese word.</li>
-    /// </ul>
+    /// A <see cref="Lucene.Net.Analysis.TokenFilter"/> with a stop word table.
+    /// <list type="bullet">
+    /// <item><description>Numeric tokens are removed.</description></item>
+    /// <item><description>English tokens must be larger than 1 char.</description></item>
+    /// <item><description>One Chinese char as one Chinese word.</description></item>
+    /// </list>
     /// TO DO:
-    /// <ol>
-    /// <li>Add Chinese stop words, such as \ue400</li>
-    /// <li>Dictionary based Chinese word extraction</li>
-    /// <li>Intelligent Chinese word extraction</li>
-    /// </ol>
+    /// <list type="number">
+    /// <item><description>Add Chinese stop words, such as \ue400</description></item>
+    /// <item><description>Dictionary based Chinese word extraction</description></item>
+    /// <item><description>Intelligent Chinese word extraction</description></item>
+    /// </list>
     /// </summary>
+    [Obsolete("(3.1) Use {Lucene.Net.Analysis.Core.StopFilter} instead, which has the same functionality. This filter will be removed in Lucene 5.0")]
     public sealed class ChineseFilter : TokenFilter
     {
         // Only English now, Chinese to be added later.
-        public static String[] STOP_WORDS =
+        public static readonly String[] STOP_WORDS =
         {
             "and", "are", "as", "at", "be", "but", "by",
             "for", "if", "in", "into", "is", "it",
@@ -58,21 +61,21 @@ public sealed class ChineseFilter : TokenFilter
         };
 
         private CharArraySet stopTable;
-        private ITermAttribute termAtt;
+        private ICharTermAttribute termAtt;
 
         public ChineseFilter(TokenStream _in)
             : base(_in)
         {
-            stopTable = new CharArraySet((IEnumerable<string>)STOP_WORDS, false);
-            termAtt = AddAttribute<ITermAttribute>();
+            stopTable = new CharArraySet(Version.LUCENE_CURRENT, STOP_WORDS.ToList<object>(), false);
+            termAtt = AddAttribute<ICharTermAttribute>();
         }
 
         public override bool IncrementToken()
         {
             while (input.IncrementToken())
             {
-                char[] text = termAtt.TermBuffer();
-                int termLength = termAtt.TermLength();
+                char[] text = termAtt.Buffer;
+                int termLength = termAtt.Length;
 
                 // why not key off token type here assuming ChineseTokenizer comes first?
                 if (!stopTable.Contains(text, 0, termLength))
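
To make the three filter rules concrete, here is a hedged walk-through using the same chain order ChineseAnalyzer builds (tokenizer first, then filter); the expected outputs are derived from the rules in the doc comment, not from a test run:

    // Input:                   "the quick 中文 a 123"
    // After ChineseTokenizer:   the | quick | 中 | 文 | a | 123
    // After ChineseFilter:
    //   "the"   -> dropped  (stop word table)
    //   "quick" -> kept     (English token longer than 1 char, not a stop word)
    //   "中"    -> kept     (one Chinese char as one Chinese word)
    //   "文"    -> kept
    //   "a"     -> dropped  (1-char English token)
    //   "123"   -> dropped  (numeric token)
    TokenStream ts = new ChineseFilter(new ChineseTokenizer(new StringReader("the quick 中文 a 123")));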
53 changes: 53 additions & 0 deletions src/contrib/Analyzers/Cn/ChineseFilterFactory.cs
@@ -0,0 +1,53 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis.Util;

namespace Lucene.Net.Analysis.Cn
{
    /// <summary>
    /// Factory for <see cref="ChineseFilter"/>
    /// </summary>
    [Obsolete("Use {Lucene.Net.Analysis.Core.StopFilterFactory} instead.")]
    public class ChineseFilterFactory : TokenFilterFactory
    {
        /// <summary>
        /// Creates a new ChineseFilterFactory
        /// </summary>
        public ChineseFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            if (args.Count > 0)
            {
                throw new ArgumentException("Unknown parameters: " + args);
            }
        }

        public override TokenStream Create(TokenStream _in)
        {
            return new ChineseFilter(_in);
        }
    }
}
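
A direct-instantiation sketch of the factory contract; in practice factories are usually resolved by name from analysis configuration, and the base TokenFilterFactory constructor is assumed to consume any framework-level arguments before the Count check runs:

    var factory = new ChineseFilterFactory(new Dictionary<string, string>());
    TokenStream filtered = factory.Create(new ChineseTokenizer(new StringReader("中文")));

    // Any leftover parameter trips the guard in the constructor:
    // new ChineseFilterFactory(new Dictionary<string, string> { { "ignoreCase", "true" } });
    //   -> ArgumentException: Unknown parameters: ...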
60 changes: 22 additions & 38 deletions src/contrib/Analyzers/Cn/ChineseTokenizer.cs
@@ -32,52 +32,45 @@
 namespace Lucene.Net.Analysis.Cn
 {
     /// <summary>
+    /// <para>
     /// Tokenize Chinese text as individual chinese chars.
-    /// <p>
+    /// </para>
+    /// <para>
     /// The difference between ChineseTokenizer and
     /// CJKTokenizer is that they have different
     /// token parsing logic.
-    /// </p>
-    /// <p>
+    /// </para>
+    /// <para>
     /// For example, if the Chinese text
     /// "C1C2C3C4" is to be indexed:
-    /// <ul>
-    /// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</li>
-    /// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</li>
-    /// </ul>
-    /// </p>
-    /// <p>
+    /// <list type="bullet">
+    /// <item><description>The tokens returned from ChineseTokenizer are C1, C2, C3, C4</description></item>
+    /// <item><description>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</description></item>
+    /// </list>
+    /// </para>
+    /// <para>
     /// Therefore the index created by CJKTokenizer is much larger.
-    /// </p>
-    /// <p>
+    /// </para>
+    /// <para>
     /// The problem is that when searching for C1, C1C2, C1C3,
     /// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
     /// CJKTokenizer will not work.
-    /// </p>
-    /// </summary>
+    /// </para>
+    /// </summary>
+    [Obsolete("(3.1) Use {Lucene.Net.Analysis.Standard.StandardTokenizer} instead, which has the same functionality. This filter will be removed in Lucene 5.0")]
     public sealed class ChineseTokenizer : Tokenizer
     {
         public ChineseTokenizer(TextReader _in)
             : base(_in)
         {
-            Init();
-        }
-
-        public ChineseTokenizer(AttributeSource source, TextReader _in)
-            : base(source, _in)
-        {
-            Init();
+            termAtt = AddAttribute<ICharTermAttribute>();
+            offsetAtt = AddAttribute<IOffsetAttribute>();
         }
 
         public ChineseTokenizer(AttributeFactory factory, TextReader _in)
             : base(factory, _in)
         {
-            Init();
-        }
-
-        private void Init()
-        {
-            termAtt = AddAttribute<ITermAttribute>();
+            termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }
 
@@ -90,8 +83,8 @@ private void Init()
         private int length;
         private int start;
 
-        private ITermAttribute termAtt;
-        private IOffsetAttribute offsetAtt;
+        private readonly ICharTermAttribute termAtt;
+        private readonly IOffsetAttribute offsetAtt;
 
         private void Push(char c)
         {
@@ -101,18 +94,16 @@ private void Push(char c)
 
         private bool Flush()
         {
-
             if (length > 0)
             {
-                termAtt.SetTermBuffer(buffer, 0, length);
+                termAtt.CopyBuffer(buffer, 0, length);
                 offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
                 return true;
             }
             else
                 return false;
         }
-
 
         public override bool IncrementToken()
         {
             ClearAttributes();
@@ -123,7 +114,6 @@ public override bool IncrementToken()
 
             while (true)
             {
-
                 char c;
                 offset++;
 
@@ -181,11 +171,5 @@ public override void Reset()
             base.Reset();
             offset = bufferIndex = dataLen = 0;
         }
-
-        public override void Reset(TextReader input)
-        {
-            base.Reset(input);
-            Reset();
-        }
     }
 }
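
A hedged sketch driving the ported tokenizer directly, illustrating the one-char-per-token behavior the doc comment describes; StartOffset/EndOffset as properties are an assumption consistent with the attribute API used elsewhere in this diff:

    var tokenizer = new ChineseTokenizer(new StringReader("中国人"));
    var term = tokenizer.GetAttribute<ICharTermAttribute>();    // registered in the constructor above
    var offsets = tokenizer.GetAttribute<IOffsetAttribute>();

    tokenizer.Reset();
    while (tokenizer.IncrementToken())
    {
        // Expected: 中 [0,1)  国 [1,2)  人 [2,3) — unigrams; a CJK bigram tokenizer
        // would instead produce 中国 and 国人, which is why a single-char query
        // like 中 matches against this index but not against a bigram index.
        Console.WriteLine("{0} [{1},{2})",
            new string(term.Buffer, 0, term.Length), offsets.StartOffset, offsets.EndOffset);
    }
    tokenizer.End();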
55 changes: 55 additions & 0 deletions src/contrib/Analyzers/Cn/ChineseTokenizerFactory.cs
@@ -0,0 +1,55 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.Cn
{
    /// <summary>
    /// Factory for <see cref="ChineseTokenizer"/>
    /// </summary>
    [Obsolete("Use {Lucene.Net.Analysis.Standard.StandardTokenizerFactory} instead.")]
    public class ChineseTokenizerFactory : TokenizerFactory
    {
        /// <summary>
        /// Creates a new ChineseTokenizerFactory
        /// </summary>
        public ChineseTokenizerFactory(IDictionary<string, string> args)
            : base(args)
        {
            if (args.Count > 0)
            {
                throw new ArgumentException("Unknown parameters: " + args);
            }
        }

        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader _in)
        {
            return new ChineseTokenizer(factory, _in);
        }
    }
}
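
Finally, a sketch wiring both factories from this PR into the same chain ChineseAnalyzer.CreateComponents builds; DEFAULT_ATTRIBUTE_FACTORY is assumed to keep its Lucene.NET 3.x name in this port:

    var tokenizerFactory = new ChineseTokenizerFactory(new Dictionary<string, string>());
    var filterFactory = new ChineseFilterFactory(new Dictionary<string, string>());

    Tokenizer source = tokenizerFactory.Create(
        AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
        new StringReader("一二三"));
    TokenStream chain = filterFactory.Create(source);   // equivalent to ChineseAnalyzer's chain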