/* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*
*  Licensed to Elasticsearch B.V. under one or more contributor
*  license agreements. See the NOTICE file distributed with
*  this work for additional information regarding copyright
*  ownership. Elasticsearch B.V. licenses this file to you under
*  the Apache License, Version 2.0 (the "License"); you may
*  not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
* 	http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing,
*  software distributed under the License is distributed on an
*  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
*  KIND, either express or implied.  See the License for the
*  specific language governing permissions and limitations
*  under the License.
*/

using System.Collections.Generic;
using System.Runtime.Serialization;
using OpenSearch.Net;
using OpenSearch.Net.Utf8Json;

namespace OpenSearch.Client
{
	/// <summary>
	/// The decompound mode determines how the tokenizer handles compound tokens.
	/// </summary>
	[StringEnum]
	public enum NoriDecompoundMode
	{
		/// <summary>Decomposes compounds and discards the original form (default).</summary>
		[EnumMember(Value = "discard")]
		Discard,

		/// <summary>No decomposition for compounds.</summary>
		[EnumMember(Value = "none")]
		None,

		/// <summary>Decomposes compounds and keeps the original form.</summary>
		[EnumMember(Value = "mixed")]
		Mixed
	}

	/// <summary>
	/// Tokenizer that ships with the analysis-nori plugin.
	/// </summary>
	public interface INoriTokenizer : ITokenizer
	{
		/// <summary>
		/// Determines how the tokenizer handles compound tokens.
		/// Defaults to <see cref="NoriDecompoundMode.Discard" />.
		/// </summary>
		[DataMember(Name = "decompound_mode")]
		NoriDecompoundMode? DecompoundMode { get; set; }

		/// <summary>
		/// Whether punctuation should be discarded from the output. Defaults to <c>true</c>.
		/// </summary>
		[DataMember(Name = "discard_punctuation")]
		[JsonFormatter(typeof(NullableStringBooleanFormatter))]
		bool? DiscardPunctuation { get; set; }

		/// <summary>
		/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with
		/// custom nouns (NNG) may be appended to the default dictionary. This property allows you
		/// to specify this file on disk.
		/// </summary>
		[DataMember(Name = "user_dictionary")]
		string UserDictionary { get; set; }

		/// <summary>
		/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with
		/// custom nouns (NNG) can be specified inline with this property.
		/// </summary>
		[DataMember(Name = "user_dictionary_rules")]
		IEnumerable<string> UserDictionaryRules { get; set; }
	}

	/// <inheritdoc cref="INoriTokenizer" />
	public class NoriTokenizer : TokenizerBase, INoriTokenizer
	{
		/// <summary>
		/// Creates a tokenizer definition whose type is <c>nori_tokenizer</c>.
		/// </summary>
		public NoriTokenizer() => Type = "nori_tokenizer";

		/// <inheritdoc />
		public NoriDecompoundMode? DecompoundMode { get; set; }

		/// <inheritdoc />
		public bool? DiscardPunctuation { get; set; }

		/// <inheritdoc />
		public string UserDictionary { get; set; }

		/// <inheritdoc />
		public IEnumerable<string> UserDictionaryRules { get; set; }
	}

	/// <inheritdoc cref="INoriTokenizer" />
	public class NoriTokenizerDescriptor
		: TokenizerDescriptorBase<NoriTokenizerDescriptor, INoriTokenizer>, INoriTokenizer
	{
		protected override string Type => "nori_tokenizer";

		// Explicit interface state: the fluent methods below share names with these
		// properties, so the values are only reachable through INoriTokenizer.
		NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
		string INoriTokenizer.UserDictionary { get; set; }
		IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }
		bool? INoriTokenizer.DiscardPunctuation { get; set; }

		/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
		public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) =>
			Assign(mode, (a, v) => a.DecompoundMode = v);

		/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
		public NoriTokenizerDescriptor UserDictionary(string path) =>
			Assign(path, (a, v) => a.UserDictionary = v);

		/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
		public NoriTokenizerDescriptor UserDictionaryRules(params string[] rules) =>
			Assign(rules, (a, v) => a.UserDictionaryRules = v);

		/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
		public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) =>
			Assign(rules, (a, v) => a.UserDictionaryRules = v);

		/// <inheritdoc cref="INoriTokenizer.DiscardPunctuation" />
		public NoriTokenizerDescriptor DiscardPunctuation(bool? discard = true) =>
			Assign(discard, (a, v) => a.DiscardPunctuation = v);
	}
}