/* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
using System.Collections.Generic;
using System.Runtime.Serialization;
using OpenSearch.Net;
using OpenSearch.Net.Utf8Json;
namespace OpenSearch.Client
{
/// <summary>
/// The decompound mode determines how the tokenizer handles compound tokens.
/// Each member carries an <c>EnumMember</c> value that names the mode as a lowercase string.
/// </summary>
[StringEnum]
public enum NoriDecompoundMode
{
/// <summary>
/// Decomposes compounds and discards the original form (default).
/// </summary>
[EnumMember(Value = "discard")]
Discard,
/// <summary>
/// No decomposition for compounds.
/// </summary>
[EnumMember(Value = "none")]
None,
/// <summary>
/// Decomposes compounds and keeps the original form.
/// </summary>
[EnumMember(Value = "mixed")]
Mixed
}
/// <summary>
/// Tokenizer that ships with the analysis-nori plugin.
/// </summary>
public interface INoriTokenizer : ITokenizer
{
/// <summary>
/// Determines how the tokenizer handles compound tokens. See <see cref="NoriDecompoundMode" />
/// for the available modes; <see cref="NoriDecompoundMode.Discard" /> is documented there as the default.
/// </summary>
[DataMember(Name = "decompound_mode")]
NoriDecompoundMode? DecompoundMode { get; set; }
/// <summary>
/// Whether punctuation should be discarded from the output. Defaults to <c>true</c>.
/// </summary>
[DataMember(Name = "discard_punctuation")]
[JsonFormatter(typeof(NullableStringBooleanFormatter))]
bool? DiscardPunctuation { get; set; }
/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom
/// nouns (NNG) may be appended to the default dictionary. This property specifies the path of that
/// dictionary file on disk.
/// </summary>
[DataMember(Name = "user_dictionary")]
string UserDictionary { get; set; }
// NOTE(review): this is the non-generic IEnumerable, yet the file only imports
// System.Collections.Generic — presumably IEnumerable<string> was intended; confirm against the
// upstream source before relying on the element type.
/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom
/// nouns (NNG) can be specified inline with this property, instead of pointing at a file.
/// </summary>
[DataMember(Name = "user_dictionary_rules")]
IEnumerable UserDictionaryRules { get; set; }
}
/// <summary>
/// Object-initializer implementation of <see cref="INoriTokenizer" />. Every setting is a plain
/// read/write property; the constructor assigns <c>Type</c> the value <c>"nori_tokenizer"</c>.
/// </summary>
public class NoriTokenizer : TokenizerBase, INoriTokenizer
{
    /// <summary>
    /// Creates the tokenizer and sets its <c>Type</c> to <c>"nori_tokenizer"</c>.
    /// </summary>
    public NoriTokenizer()
    {
        Type = "nori_tokenizer";
    }

    /// <inheritdoc />
    public NoriDecompoundMode? DecompoundMode { get; set; }

    /// <inheritdoc />
    public bool? DiscardPunctuation { get; set; }

    /// <inheritdoc />
    public string UserDictionary { get; set; }

    // NOTE(review): non-generic IEnumerable, mirroring the interface declaration — presumably
    // IEnumerable<string> was intended; confirm against the upstream source.
    /// <inheritdoc />
    public IEnumerable UserDictionaryRules { get; set; }
}
/// <summary>
/// Fluent builder for <see cref="INoriTokenizer" />. The interface properties are implemented
/// explicitly and are written through the public methods below, each of which hands its argument
/// to <c>Assign</c> and returns the descriptor so that calls can be chained.
/// </summary>
// NOTE(review): the base type is written without type arguments here although Assign is used with
// a two-parameter lambda and returns this descriptor type — presumably the base is generic;
// confirm against the upstream source.
public class NoriTokenizerDescriptor
    : TokenizerDescriptorBase, INoriTokenizer
{
    /// <summary>The tokenizer type name reported by this descriptor, <c>"nori_tokenizer"</c>.</summary>
    protected override string Type
    {
        get { return "nori_tokenizer"; }
    }

    NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }

    string INoriTokenizer.UserDictionary { get; set; }

    IEnumerable INoriTokenizer.UserDictionaryRules { get; set; }

    bool? INoriTokenizer.DiscardPunctuation { get; set; }

    /// <summary>Sets how compound tokens are handled.</summary>
    /// <param name="mode">The decompound mode to store; may be <c>null</c>.</param>
    public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) =>
        Assign(mode, (tokenizer, value) => tokenizer.DecompoundMode = value);

    /// <summary>Sets the path of the user dictionary file.</summary>
    /// <param name="path">The value to store in <c>UserDictionary</c>.</param>
    public NoriTokenizerDescriptor UserDictionary(string path) =>
        Assign(path, (tokenizer, value) => tokenizer.UserDictionary = value);

    /// <summary>Sets the inline user dictionary rules from a parameter list of strings.</summary>
    /// <param name="rules">The rules to store in <c>UserDictionaryRules</c>.</param>
    public NoriTokenizerDescriptor UserDictionaryRules(params string[] rules) =>
        Assign(rules, (tokenizer, value) => tokenizer.UserDictionaryRules = value);

    /// <summary>Sets the inline user dictionary rules from an existing sequence.</summary>
    /// <param name="rules">The rules to store in <c>UserDictionaryRules</c>.</param>
    public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable rules) =>
        Assign(rules, (tokenizer, value) => tokenizer.UserDictionaryRules = value);

    /// <summary>Sets whether punctuation is discarded from the output.</summary>
    /// <param name="discard">
    /// The value to store in <c>DiscardPunctuation</c>; <c>true</c> when the argument is omitted.
    /// </param>
    public NoriTokenizerDescriptor DiscardPunctuation(bool? discard = true) =>
        Assign(discard, (tokenizer, value) => tokenizer.DiscardPunctuation = value);
}
}