/* SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */
/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 *
 *  Licensed to Elasticsearch B.V. under one or more contributor
 *  license agreements. See the NOTICE file distributed with
 *  this work for additional information regarding copyright
 *  ownership. Elasticsearch B.V. licenses this file to you under
 *  the Apache License, Version 2.0 (the "License"); you may
 *  not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  	http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied. See the License for the
 *  specific language governing permissions and limitations
 *  under the License.
 */

using System;
using System.Collections.Generic;
using OpenSearch.Net.Utf8Json;

namespace OpenSearch.Client
{
	[JsonFormatter(typeof(VerbatimDictionaryKeysFormatter<Tokenizers, ITokenizers, string, ITokenizer>))]
	public interface ITokenizers : IIsADictionary<string, ITokenizer> { }

	public class Tokenizers : IsADictionaryBase<string, ITokenizer>, ITokenizers
	{
		public Tokenizers() { }

		public Tokenizers(IDictionary<string, ITokenizer> container) : base(container) { }

		public Tokenizers(Dictionary<string, ITokenizer> container) : base(container) { }

		public void Add(string name, ITokenizer analyzer) => BackingDictionary.Add(name, analyzer);
	}

	public class TokenizersDescriptor : IsADictionaryDescriptorBase<TokenizersDescriptor, ITokenizers, string, ITokenizer>
	{
		public TokenizersDescriptor() : base(new Tokenizers()) { }

		public TokenizersDescriptor UserDefined(string name, ITokenizer analyzer) => Assign(name, analyzer);

		/// <summary>
		/// A tokenizer of type edgeNGram.
		/// </summary>
		public TokenizersDescriptor EdgeNGram(string name, Func<EdgeNGramTokenizerDescriptor, IEdgeNGramTokenizer> selector) =>
			Assign(name, selector?.Invoke(new EdgeNGramTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type nGram.
		/// </summary>
		public TokenizersDescriptor NGram(string name, Func<NGramTokenizerDescriptor, INGramTokenizer> selector) =>
			Assign(name, selector?.Invoke(new NGramTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type keyword that emits the entire input as a single output.
		/// </summary>
		public TokenizersDescriptor Keyword(string name, Func<KeywordTokenizerDescriptor, IKeywordTokenizer> selector) =>
			Assign(name, selector?.Invoke(new KeywordTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type letter that divides text at non-letters. That is to say, it defines tokens as maximal strings of
		/// adjacent letters.
		/// <para>
		/// Note, this does a decent job for most European languages, but does a terrible job for some Asian languages, where words
		/// are not separated by spaces.
		/// </para>
		/// </summary>
		public TokenizersDescriptor Letter(string name, Func<LetterTokenizerDescriptor, ILetterTokenizer> selector) =>
			Assign(name, selector?.Invoke(new LetterTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type lowercase that performs the function of Letter Tokenizer and Lower Case Token Filter together.
		/// It divides text at non-letters and converts them to lower case.
		/// While it is functionally equivalent to the combination of Letter Tokenizer and Lower Case Token Filter,
		/// there is a performance advantage to doing the two tasks at once, hence this (redundant) implementation.
		/// </summary>
		public TokenizersDescriptor Lowercase(string name, Func<LowercaseTokenizerDescriptor, ILowercaseTokenizer> selector) =>
			Assign(name, selector?.Invoke(new LowercaseTokenizerDescriptor()));

		/// <summary>
		/// The path_hierarchy tokenizer takes something like this:
		/// <para>/something/something/else</para>
		/// <para>And produces tokens:</para>
		/// <para>/something</para>
		/// <para>/something/something</para>
		/// <para>/something/something/else</para>
		/// </summary>
		public TokenizersDescriptor PathHierarchy(string name, Func<PathHierarchyTokenizerDescriptor, IPathHierarchyTokenizer> selector) =>
			Assign(name, selector?.Invoke(new PathHierarchyTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type pattern that can flexibly separate text into terms via a regular expression.
		/// </summary>
		public TokenizersDescriptor Pattern(string name, Func<PatternTokenizerDescriptor, IPatternTokenizer> selector) =>
			Assign(name, selector?.Invoke(new PatternTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type standard providing a grammar-based tokenizer that is a good choice for most European language
		/// documents.
		/// The tokenizer implements the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29.
		/// </summary>
		public TokenizersDescriptor Standard(string name, Func<StandardTokenizerDescriptor, IStandardTokenizer> selector = null) =>
			Assign(name, selector.InvokeOrDefault(new StandardTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type uax_url_email which works exactly like the standard tokenizer, but tokenizes emails and URLs as
		/// single tokens.
		/// </summary>
		public TokenizersDescriptor UaxEmailUrl(string name, Func<UaxEmailUrlTokenizerDescriptor, IUaxEmailUrlTokenizer> selector) =>
			Assign(name, selector?.Invoke(new UaxEmailUrlTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type whitespace that divides text at whitespace.
		/// </summary>
		public TokenizersDescriptor Whitespace(string name, Func<WhitespaceTokenizerDescriptor, IWhitespaceTokenizer> selector = null) =>
			Assign(name, selector.InvokeOrDefault(new WhitespaceTokenizerDescriptor()));

		/// <summary>
		/// A tokenizer of type kuromoji_tokenizer that tokenizes Japanese text using morphological analysis.
		/// Part of the `analysis-kuromoji` plugin.
		/// </summary>
		public TokenizersDescriptor Kuromoji(string name, Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer> selector) =>
			Assign(name, selector?.Invoke(new KuromojiTokenizerDescriptor()));

		/// <summary>
		/// Tokenizes text into words on word boundaries, as defined in UAX #29: Unicode Text Segmentation. It behaves much
		/// like the standard tokenizer, but adds better support for some Asian languages by using a dictionary-based approach
		/// to identify words in Thai, Lao, Chinese, Japanese, and Korean, and using custom rules to break Myanmar and Khmer
		/// text into syllables.
		/// Part of the `analysis-icu` plugin.
		/// </summary>
		public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
			Assign(name, selector?.Invoke(new IcuTokenizerDescriptor()));

		/// <inheritdoc cref="INoriTokenizer" />
		public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
			Assign(name, selector?.Invoke(new NoriTokenizerDescriptor()));

		/// <inheritdoc cref="ICharGroupTokenizer" />
		public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
			Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
	}
}
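
// Usage sketch (kept as a comment so this file stays compilable): it shows how TokenizersDescriptor is
// typically consumed through the analysis settings of an index-creation request. The index name,
// tokenizer/analyzer names, and parameter values below are illustrative assumptions, not part of this file.
//
// var client = new OpenSearchClient();
//
// var createIndexResponse = client.Indices.Create("my-index", c => c
// 	.Settings(s => s
// 		.Analysis(a => a
// 			.Tokenizers(t => t
// 				// register a custom edge n-gram tokenizer under the name "my_edge_ngram"
// 				.EdgeNGram("my_edge_ngram", e => e
// 					.MinGram(2)
// 					.MaxGram(10)
// 					.TokenChars(TokenChar.Letter, TokenChar.Digit))
// 				// register a path hierarchy tokenizer that splits on '/'
// 				.PathHierarchy("my_paths", p => p.Delimiter('/')))
// 			.Analyzers(an => an
// 				// a custom analyzer referencing the tokenizer registered above by name
// 				.Custom("my_analyzer", ca => ca
// 					.Tokenizer("my_edge_ngram")
// 					.Filters("lowercase"))))));
//
// Each descriptor method takes the tokenizer's registration name plus a lambda selector that configures a
// fresh per-type descriptor, which is why the methods above are expressed as Assign(name, selector?.Invoke(...)).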