/* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
using System;
namespace OpenSearch.Client.Specification.IndicesApi
{
public class AnalyzeTokenizersSelector : SelectorBase
{
/// <summary>
/// A tokenizer of type edgeNGram.
/// </summary>
public ITokenizer EdgeNGram(Func<EdgeNGramTokenizerDescriptor, IEdgeNGramTokenizer> selector) =>
    selector?.Invoke(new EdgeNGramTokenizerDescriptor());

/// <summary>
/// A tokenizer of type nGram.
/// </summary>
public ITokenizer NGram(Func<NGramTokenizerDescriptor, INGramTokenizer> selector) =>
    selector?.Invoke(new NGramTokenizerDescriptor());

/// <summary>
/// A tokenizer of type keyword that emits the entire input as a single token.
/// </summary>
public ITokenizer Keyword(Func<KeywordTokenizerDescriptor, IKeywordTokenizer> selector) =>
    selector?.Invoke(new KeywordTokenizerDescriptor());

/// <summary>
/// A tokenizer of type letter that divides text at non-letters. That’s to say, it defines tokens as maximal strings of adjacent letters.
/// <para>
/// Note, this does a decent job for most European languages, but does a terrible job for some Asian languages, where words are not
/// separated by spaces.
/// </para>
/// </summary>
public ITokenizer Letter(Func<LetterTokenizerDescriptor, ILetterTokenizer> selector) =>
    selector?.Invoke(new LetterTokenizerDescriptor());

/// <summary>
/// A tokenizer of type lowercase that performs the function of Letter Tokenizer and Lower Case Token Filter together.
/// It divides text at non-letters and converts them to lower case.
/// While it is functionally equivalent to the combination of Letter Tokenizer and Lower Case Token Filter,
/// there is a performance advantage to doing the two tasks at once, hence this (redundant) implementation.
/// </summary>
public ITokenizer Lowercase(Func<LowercaseTokenizerDescriptor, ILowercaseTokenizer> selector) =>
    selector?.Invoke(new LowercaseTokenizerDescriptor());

/// <summary>
/// The path_hierarchy tokenizer takes something like this:
/// <para>/something/something/else</para>
/// <para>And produces tokens:</para>
/// <para>/something</para>
/// <para>/something/something</para>
/// <para>/something/something/else</para>
/// </summary>
public ITokenizer PathHierarchy(Func<PathHierarchyTokenizerDescriptor, IPathHierarchyTokenizer> selector) =>
    selector?.Invoke(new PathHierarchyTokenizerDescriptor());

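// Illustrative sketch only (not part of the library): supplied to an analyze request as
// `a => a.Tokenizer(t => t.PathHierarchy(p => p)).Text("/one/two/three")`, this tokenizer would emit
// "/one", "/one/two" and "/one/two/three". The fluent call shape is an assumption based on the
// client's conventions; see the analyze descriptor for the authoritative overloads.
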
/// <summary>
/// A tokenizer of type pattern that can flexibly separate text into terms via a regular expression.
/// </summary>
public ITokenizer Pattern(Func<PatternTokenizerDescriptor, IPatternTokenizer> selector) =>
    selector?.Invoke(new PatternTokenizerDescriptor());

/// <summary>
/// A tokenizer of type standard, providing a grammar-based tokenizer that is a good choice for most European-language documents.
/// The tokenizer implements the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29.
/// </summary>
public ITokenizer Standard(Func<StandardTokenizerDescriptor, IStandardTokenizer> selector = null) =>
    selector.InvokeOrDefault(new StandardTokenizerDescriptor());

/// <summary>
/// A tokenizer of type uax_url_email which works exactly like the standard tokenizer, but tokenizes email addresses and URLs as single tokens.
/// </summary>
public ITokenizer UaxEmailUrl(Func<UaxEmailUrlTokenizerDescriptor, IUaxEmailUrlTokenizer> selector) =>
    selector?.Invoke(new UaxEmailUrlTokenizerDescriptor());

/// <summary>
/// A tokenizer of type whitespace that divides text at whitespace.
/// </summary>
public ITokenizer Whitespace(Func<WhitespaceTokenizerDescriptor, IWhitespaceTokenizer> selector = null) =>
    selector.InvokeOrDefault(new WhitespaceTokenizerDescriptor());

/// <summary>
/// A tokenizer of type kuromoji_tokenizer that uses morphological analysis to tokenize Japanese text.
/// Part of the `analysis-kuromoji` plugin.
/// </summary>
public ITokenizer Kuromoji(Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer> selector) =>
    selector?.Invoke(new KuromojiTokenizerDescriptor());

/// <summary>
/// Tokenizes text into words on word boundaries, as defined in UAX #29: Unicode Text Segmentation. It behaves much
/// like the standard tokenizer, but adds better support for some Asian languages by using a dictionary-based approach
/// to identify words in Thai, Lao, Chinese, Japanese, and Korean, and using custom rules to break Myanmar and Khmer
/// text into syllables.
/// Part of the `analysis-icu` plugin.
/// </summary>
public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
    selector?.Invoke(new IcuTokenizerDescriptor());

/// <summary>
/// A tokenizer of type nori_tokenizer that uses morphological analysis to tokenize Korean text.
/// Part of the `analysis-nori` plugin.
/// </summary>
public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
    selector?.Invoke(new NoriTokenizerDescriptor());

/// <summary>
/// A tokenizer of type char_group that breaks text into terms whenever it encounters a character from a defined set.
/// </summary>
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
    selector?.Invoke(new CharGroupTokenizerDescriptor());
}
}
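
// A minimal usage sketch (illustrative, not part of this file's API surface): the selector above is
// typically consumed through the analyze API's fluent Tokenizer overload, which lets a request define
// an inline tokenizer instead of referencing one by name. Assuming an IOpenSearchClient instance named
// `client`, a call might look like the following; the overload shapes are assumptions based on the
// client's fluent conventions rather than authoritative signatures.
//
//     var analyzeResponse = client.Indices.Analyze(a => a
//         .Tokenizer(t => t.Whitespace())
//         .Text("The quick brown fox jumped over the lazy dog"));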