/* SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */
/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 *
 * Licensed to Elasticsearch B.V. under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch B.V. licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

using System.Collections.Generic;
using System.Runtime.Serialization;
using OpenSearch.Net.Utf8Json;

namespace OpenSearch.Client
{
    /// <summary>
    /// Settings for the <c>kuromoji_tokenizer</c> tokenizer.
    /// Part of the `analysis-kuromoji` plugin.
    /// </summary>
    public interface IKuromojiTokenizer : ITokenizer
    {
        /// <summary>
        /// Whether punctuation should be discarded from the output. Defaults to true.
        /// </summary>
        [DataMember(Name = "discard_punctuation")]
        [JsonFormatter(typeof(NullableStringBooleanFormatter))]
        bool? DiscardPunctuation { get; set; }

        /// <summary>
        /// Whether original compound tokens should be discarded from the output with
        /// the <c>search</c> tokenization mode (see <see cref="Mode" />). Defaults to `false`.
        /// </summary>
        [DataMember(Name = "discard_compound_token")]
        [JsonFormatter(typeof(NullableStringBooleanFormatter))]
        bool? DiscardCompoundToken { get; set; }

        /// <summary>
        /// The tokenization mode determines how the tokenizer handles compound and unknown words.
        /// </summary>
        [DataMember(Name = "mode")]
        KuromojiTokenizationMode? Mode { get; set; }

        /// <summary>
        /// The nbest_cost parameter specifies an additional Viterbi cost. The KuromojiTokenizer will include all tokens in
        /// Viterbi paths that are within the nbest_cost value of the best path.
        /// </summary>
        [DataMember(Name = "nbest_cost")]
        [JsonFormatter(typeof(NullableStringIntFormatter))]
        int? NBestCost { get; set; }

        /// <summary>
        /// The nbest_examples can be used to find a nbest_cost value based on examples. For example,
        /// a value of /箱根山-箱根/成田空港-成田/ indicates that in the texts, 箱根山 (Mt. Hakone) and 成田空港 (Narita Airport)
        /// we'd like a cost that gives us 箱根 (Hakone) and 成田 (Narita).
        /// </summary>
        [DataMember(Name = "nbest_examples")]
        string NBestExamples { get; set; }

        /// <summary>
        /// The Kuromoji tokenizer uses the MeCab-IPADIC dictionary by default. A user_dictionary may be
        /// appended to the default dictionary.
        /// </summary>
        [DataMember(Name = "user_dictionary")]
        string UserDictionary { get; set; }

        /// <summary>
        /// Inline rule version of <see cref="UserDictionary" />.
        /// </summary>
        [DataMember(Name = "user_dictionary_rules")]
        IEnumerable<string> UserDictionaryRules { get; set; }
    }

    /// <inheritdoc cref="IKuromojiTokenizer" />
    public class KuromojiTokenizer : TokenizerBase, IKuromojiTokenizer
    {
        /// <summary>
        /// Creates the tokenizer with its <c>Type</c> set to <c>kuromoji_tokenizer</c>.
        /// </summary>
        public KuromojiTokenizer() => Type = "kuromoji_tokenizer";

        /// <inheritdoc />
        public bool? DiscardPunctuation { get; set; }

        /// <inheritdoc />
        public bool? DiscardCompoundToken { get; set; }

        /// <inheritdoc />
        public KuromojiTokenizationMode? Mode { get; set; }

        /// <inheritdoc />
        public int? NBestCost { get; set; }

        /// <inheritdoc />
        public string NBestExamples { get; set; }

        /// <inheritdoc />
        public string UserDictionary { get; set; }

        /// <inheritdoc />
        public IEnumerable<string> UserDictionaryRules { get; set; }
    }

    /// <inheritdoc cref="IKuromojiTokenizer" />
    public class KuromojiTokenizerDescriptor
        : TokenizerDescriptorBase<KuromojiTokenizerDescriptor, IKuromojiTokenizer>, IKuromojiTokenizer
    {
        protected override string Type => "kuromoji_tokenizer";

        // Explicit interface implementations: state is reachable only through
        // IKuromojiTokenizer, leaving the public surface to the fluent methods below.
        bool? IKuromojiTokenizer.DiscardPunctuation { get; set; }
        bool? IKuromojiTokenizer.DiscardCompoundToken { get; set; }
        KuromojiTokenizationMode? IKuromojiTokenizer.Mode { get; set; }
        int? IKuromojiTokenizer.NBestCost { get; set; }
        string IKuromojiTokenizer.NBestExamples { get; set; }
        string IKuromojiTokenizer.UserDictionary { get; set; }
        IEnumerable<string> IKuromojiTokenizer.UserDictionaryRules { get; set; }

        /// <inheritdoc cref="IKuromojiTokenizer.Mode" />
        public KuromojiTokenizerDescriptor Mode(KuromojiTokenizationMode? mode) =>
            Assign(mode, (a, v) => a.Mode = v);

        /// <inheritdoc cref="IKuromojiTokenizer.DiscardPunctuation" />
        public KuromojiTokenizerDescriptor DiscardPunctuation(bool? discard = true) =>
            Assign(discard, (a, v) => a.DiscardPunctuation = v);

        /// <inheritdoc cref="IKuromojiTokenizer.DiscardCompoundToken" />
        public KuromojiTokenizerDescriptor DiscardCompoundToken(bool? discard = true) =>
            Assign(discard, (a, v) => a.DiscardCompoundToken = v);

        /// <inheritdoc cref="IKuromojiTokenizer.UserDictionary" />
        public KuromojiTokenizerDescriptor UserDictionary(string userDictionary) =>
            Assign(userDictionary, (a, v) => a.UserDictionary = v);

        /// <inheritdoc cref="IKuromojiTokenizer.NBestExamples" />
        public KuromojiTokenizerDescriptor NBestExamples(string examples) =>
            Assign(examples, (a, v) => a.NBestExamples = v);

        /// <inheritdoc cref="IKuromojiTokenizer.NBestCost" />
        public KuromojiTokenizerDescriptor NBestCost(int? cost) =>
            Assign(cost, (a, v) => a.NBestCost = v);

        /// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules" />
        public KuromojiTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) =>
            // Assign the lambda's own value argument, matching the other setters,
            // instead of capturing the outer parameter in a closure.
            Assign(rules, (a, v) => a.UserDictionaryRules = v);

        /// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules" />
        public KuromojiTokenizerDescriptor UserDictionaryRules(params string[] rules) =>
            Assign(rules, (a, v) => a.UserDictionaryRules = v);
    }
}