/* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
using System.Collections.Generic;
using System.Runtime.Serialization;
using OpenSearch.Net.Utf8Json;
namespace OpenSearch.Client
{
/// <summary>
/// Settings for the kuromoji tokenizer, which splits Japanese text into terms.
/// Part of the `analysis-kuromoji` plugin.
/// </summary>
public interface IKuromojiTokenizer : ITokenizer
{
    /// <summary>
    /// Whether punctuation should be discarded from the output. Defaults to true.
    /// </summary>
    [DataMember(Name = "discard_punctuation")]
    [JsonFormatter(typeof(NullableStringBooleanFormatter))]
    bool? DiscardPunctuation { get; set; }

    /// <summary>
    /// Whether original compound tokens should be discarded from the output when the
    /// search tokenization mode is used. Defaults to `false`.
    /// </summary>
    [DataMember(Name = "discard_compound_token")]
    [JsonFormatter(typeof(NullableStringBooleanFormatter))]
    bool? DiscardCompoundToken { get; set; }

    /// <summary>
    /// The tokenization mode determines how the tokenizer handles compound and unknown words.
    /// </summary>
    [DataMember(Name = "mode")]
    KuromojiTokenizationMode? Mode { get; set; }

    /// <summary>
    /// The nbest_cost parameter specifies an additional Viterbi cost. The KuromojiTokenizer will
    /// include all tokens in Viterbi paths that are within the nbest_cost value of the best path.
    /// </summary>
    [DataMember(Name = "nbest_cost")]
    [JsonFormatter(typeof(NullableStringIntFormatter))]
    int? NBestCost { get; set; }

    /// <summary>
    /// The nbest_examples can be used to find a nbest_cost value based on examples. For example,
    /// a value of /箱根山-箱根/成田空港-成田/ indicates that in the texts 箱根山 (Mt. Hakone) and
    /// 成田空港 (Narita Airport) we'd like a cost that gives us 箱根 (Hakone) and 成田 (Narita).
    /// </summary>
    [DataMember(Name = "nbest_examples")]
    string NBestExamples { get; set; }

    /// <summary>
    /// The Kuromoji tokenizer uses the MeCab-IPADIC dictionary by default. A user_dictionary may be
    /// appended to the default dictionary.
    /// </summary>
    [DataMember(Name = "user_dictionary")]
    string UserDictionary { get; set; }

    /// <summary>
    /// Inline rule version of <see cref="UserDictionary"/>: the dictionary rules are supplied
    /// directly in the tokenizer settings.
    /// </summary>
    [DataMember(Name = "user_dictionary_rules")]
    IEnumerable UserDictionaryRules { get; set; }
}
/// <summary>
/// Plain object implementation of <see cref="IKuromojiTokenizer"/>. Construction sets the
/// tokenizer type name to "kuromoji_tokenizer".
/// </summary>
public class KuromojiTokenizer : TokenizerBase, IKuromojiTokenizer
{
    /// <summary>
    /// Creates a tokenizer definition whose type name is "kuromoji_tokenizer".
    /// </summary>
    public KuromojiTokenizer()
    {
        Type = "kuromoji_tokenizer";
    }

    /// <inheritdoc cref="IKuromojiTokenizer.DiscardPunctuation"/>
    public bool? DiscardPunctuation { get; set; }

    /// <inheritdoc cref="IKuromojiTokenizer.DiscardCompoundToken"/>
    public bool? DiscardCompoundToken { get; set; }

    /// <inheritdoc cref="IKuromojiTokenizer.Mode"/>
    public KuromojiTokenizationMode? Mode { get; set; }

    /// <inheritdoc cref="IKuromojiTokenizer.NBestCost"/>
    public int? NBestCost { get; set; }

    /// <inheritdoc cref="IKuromojiTokenizer.NBestExamples"/>
    public string NBestExamples { get; set; }

    /// <inheritdoc cref="IKuromojiTokenizer.UserDictionary"/>
    public string UserDictionary { get; set; }

    /// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules"/>
    public IEnumerable UserDictionaryRules { get; set; }
}
/// <summary>
/// Fluent descriptor for <see cref="IKuromojiTokenizer"/>. The settings are held in explicitly
/// implemented interface properties; each public method hands its argument to <c>Assign</c>
/// together with a callback that stores it on the matching property.
/// </summary>
public class KuromojiTokenizerDescriptor
    : TokenizerDescriptorBase, IKuromojiTokenizer
{
    /// <summary>The tokenizer type name, "kuromoji_tokenizer".</summary>
    protected override string Type => "kuromoji_tokenizer";

    bool? IKuromojiTokenizer.DiscardPunctuation { get; set; }
    bool? IKuromojiTokenizer.DiscardCompoundToken { get; set; }
    KuromojiTokenizationMode? IKuromojiTokenizer.Mode { get; set; }
    int? IKuromojiTokenizer.NBestCost { get; set; }
    string IKuromojiTokenizer.NBestExamples { get; set; }
    string IKuromojiTokenizer.UserDictionary { get; set; }
    IEnumerable IKuromojiTokenizer.UserDictionaryRules { get; set; }

    /// <inheritdoc cref="IKuromojiTokenizer.Mode"/>
    public KuromojiTokenizerDescriptor Mode(KuromojiTokenizationMode? mode) =>
        Assign(mode, (a, v) => a.Mode = v);

    /// <inheritdoc cref="IKuromojiTokenizer.DiscardPunctuation"/>
    /// <param name="discard">The value to store; <c>true</c> when the argument is omitted.</param>
    public KuromojiTokenizerDescriptor DiscardPunctuation(bool? discard = true) =>
        Assign(discard, (a, v) => a.DiscardPunctuation = v);

    /// <inheritdoc cref="IKuromojiTokenizer.DiscardCompoundToken"/>
    /// <param name="discard">The value to store; <c>true</c> when the argument is omitted.</param>
    public KuromojiTokenizerDescriptor DiscardCompoundToken(bool? discard = true) =>
        Assign(discard, (a, v) => a.DiscardCompoundToken = v);

    /// <inheritdoc cref="IKuromojiTokenizer.UserDictionary"/>
    public KuromojiTokenizerDescriptor UserDictionary(string userDictionary) =>
        Assign(userDictionary, (a, v) => a.UserDictionary = v);

    /// <inheritdoc cref="IKuromojiTokenizer.NBestExamples"/>
    public KuromojiTokenizerDescriptor NBestExamples(string examples) =>
        Assign(examples, (a, v) => a.NBestExamples = v);

    /// <inheritdoc cref="IKuromojiTokenizer.NBestCost"/>
    public KuromojiTokenizerDescriptor NBestCost(int? cost) =>
        Assign(cost, (a, v) => a.NBestCost = v);

    /// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules"/>
    // Uses the callback parameter `v` rather than capturing `rules`, matching the other
    // fluent methods and keeping the lambda free of captured state.
    // NOTE(review): assumes Assign passes its first argument through as `v` — confirm in the base class.
    public KuromojiTokenizerDescriptor UserDictionaryRules(IEnumerable rules) =>
        Assign(rules, (a, v) => a.UserDictionaryRules = v);

    /// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules"/>
    // Same as the overload above, but accepts the rules as a params array.
    public KuromojiTokenizerDescriptor UserDictionaryRules(params string[] rules) =>
        Assign(rules, (a, v) => a.UserDictionaryRules = v);
}
}