/* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

using System;
using System.Collections.Generic;
using OpenSearch.Net.Utf8Json;

namespace OpenSearch.Client
{
    /// <summary>
    /// Dictionary-shaped contract for a set of named token filters.
    /// JSON formatting is delegated to <c>VerbatimDictionaryKeysFormatter</c> via the attribute below.
    /// </summary>
    [JsonFormatter(typeof(VerbatimDictionaryKeysFormatter))]
    public interface ITokenFilters : IIsADictionary { }

    /// <summary>
    /// Default <see cref="ITokenFilters" /> implementation; storage comes from the dictionary base class.
    /// </summary>
    public class TokenFilters : IsADictionaryBase, ITokenFilters
    {
        /// <summary>Creates an instance without supplying a container to the base class.</summary>
        public TokenFilters() { }

        /// <summary>Creates an instance and forwards <paramref name="container" /> to the base constructor.</summary>
        public TokenFilters(IDictionary container) : base(container) { }

        /// <summary>Creates an instance and forwards <paramref name="container" /> to the base constructor.</summary>
        public TokenFilters(Dictionary container) : base(container) { }

        /// <summary>Adds a token filter to the backing dictionary under <paramref name="name" />.</summary>
        /// <param name="name">Key to store the filter under.</param>
        /// <param name="analyzer">The token filter to store (the parameter is named <c>analyzer</c>).</param>
        public void Add(string name, ITokenFilter analyzer)
        {
            BackingDictionary.Add(name, analyzer);
        }
    }

    /// <summary>
    /// Fluent builder for <see cref="TokenFilters" />; each method hands a named filter to <c>Assign</c>.
    /// </summary>
    public class TokenFiltersDescriptor : IsADictionaryDescriptorBase
    {
        /// <summary>Starts the descriptor from a new, empty <see cref="TokenFilters" />.</summary>
        public TokenFiltersDescriptor() : base(new TokenFilters()) { }

        /// <summary>Registers an already-constructed token filter under <paramref name="name" />.</summary>
        /// <param name="name">Key passed to <c>Assign</c>.</param>
        /// <param name="analyzer">The token filter instance passed to <c>Assign</c> unchanged.</param>
        /// <returns>Whatever <c>Assign</c> returns.</returns>
        public TokenFiltersDescriptor UserDefined(string name, ITokenFilter analyzer)
        {
            return Assign(name, analyzer);
        }

///

/// Token filters that allow to decompose compound words using a dictionary ///

public TokenFiltersDescriptor DictionaryDecompounder(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new DictionaryDecompounderTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Token filters that allow to decompose compound words using FOP XML ///

public TokenFiltersDescriptor HyphenationDecompounder(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new HyphenationDecompounderTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type edgeNGram. ///

public TokenFiltersDescriptor EdgeNGram(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new EdgeNGramTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The phonetic token filter is provided as a plugin. ///

public TokenFiltersDescriptor Phonetic(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new PhoneticTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type shingle that constructs shingles (token n-grams) from a token stream. /// In other words, it creates combinations of tokens as a single token. ///

public TokenFiltersDescriptor Shingle(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new ShingleTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type stop that removes stop words from token streams. ///

public TokenFiltersDescriptor Stop(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new StopTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The synonym token filter allows to easily handle synonyms during the analysis process. ///

public TokenFiltersDescriptor Synonym(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new SynonymTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The synonym_graph token filter allows to easily handle synonyms, /// including multi-word synonyms correctly during the analysis process. ///

public TokenFiltersDescriptor SynonymGraph(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new SynonymGraphTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type word_delimiter that splits words into subwords and performs optional transformations on subword groups. ///

public TokenFiltersDescriptor WordDelimiter(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new WordDelimiterTokenFilterDescriptor());
    return Assign(name, filter);
}

///

public TokenFiltersDescriptor WordDelimiterGraph(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new WordDelimiterGraphTokenFilterDescriptor());
    return Assign(name, filter);
}

///

public TokenFiltersDescriptor AsciiFolding(string name, Func selector)
{
    // Registers an asciifolding filter: it converts Unicode characters outside the first 127 ASCII
    // characters into their ASCII equivalents where one exists.
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new AsciiFoldingTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Token filter that generates bigrams for frequently occurring terms. Single terms are still indexed. /// Note, common_words or common_words_path field is required. ///

public TokenFiltersDescriptor CommonGrams(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new CommonGramsTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Splits tokens into tokens and payload whenever a delimiter character is found. ///

public TokenFiltersDescriptor DelimitedPayload(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new DelimitedPayloadTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter which removes elisions. For example, “l’avion” (the plane) will be tokenized as “avion” (plane). ///

public TokenFiltersDescriptor Elision(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new ElisionTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Basic support for hunspell stemming. /// Hunspell dictionaries will be picked up from a dedicated hunspell directory on the filesystem. ///

public TokenFiltersDescriptor Hunspell(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new HunspellTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type keep_types that keeps or removes tokens of a specific type. ///

public TokenFiltersDescriptor KeepTypes(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new KeepTypesTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type keep that only keeps tokens with text contained in a predefined set of words. ///

public TokenFiltersDescriptor KeepWords(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new KeepWordsTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Protects words from being modified by stemmers. Must be placed before any stemming filters. ///

public TokenFiltersDescriptor KeywordMarker(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new KeywordMarkerTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The kstem token filter is a high performance filter for English. /// All terms must already be lowercased (use lowercase filter) for this filter to work correctly. ///

public TokenFiltersDescriptor KStem(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new KStemTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// A token filter of type length that removes words that are too long or too short for the stream. ///

public TokenFiltersDescriptor Length(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new LengthTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Limits the number of tokens that are indexed per document and field. ///

public TokenFiltersDescriptor LimitTokenCount(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new LimitTokenCountTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type lowercase that normalizes token text to lower case. /// Lowercase token filter supports Greek and Turkish lowercase token filters through the language parameter. ///

public TokenFiltersDescriptor Lowercase(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new LowercaseTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// A token filter of type nGram. ///

public TokenFiltersDescriptor NGram(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new NGramTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The pattern_capture token filter, unlike the pattern tokenizer, emits a token for every capture group in the regular /// expression. ///

public TokenFiltersDescriptor PatternCapture(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new PatternCaptureTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The pattern_replace token filter allows to easily handle string replacements based on a regular expression. ///

public TokenFiltersDescriptor PatternReplace(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new PatternReplaceTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type porterStem that transforms the token stream as per the Porter stemming algorithm. ///

public TokenFiltersDescriptor PorterStem(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new PorterStemTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// A token filter of type reverse that simply reverses the tokens. ///

public TokenFiltersDescriptor Reverse(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new ReverseTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// A filter that stems words using a Snowball-generated stemmer. ///

public TokenFiltersDescriptor Snowball(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new SnowballTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A filter that stems words (similar to snowball, but with more options). ///

public TokenFiltersDescriptor Stemmer(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new StemmerTokenFilterDescriptor());
    return Assign(name, filter);
}

// Predicate and Condition previously shared a physical line with Stemmer, behind a `///` marker,
// which turned both declarations into comment text. Each declaration now sits on its own lines.

///
public TokenFiltersDescriptor Predicate(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new PredicateTokenFilterDescriptor());
    return Assign(name, filter);
}

///
public TokenFiltersDescriptor Condition(string name, Func selector)
{
    // A null selector short-circuits to null, which is handed to Assign unchanged.
    var filter = selector?.Invoke(new ConditionTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Overrides stemming algorithms, by applying a custom mapping, then protecting these terms from being modified by /// stemmers. Must be placed /// before any stemming filters. ///

public TokenFiltersDescriptor StemmerOverride(string name, Func selector)
{
    // The selector configures a fresh descriptor; when it is null the result is null.
    var filter = selector?.Invoke(new StemmerOverrideTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The trim token filter trims surrounding whitespaces around a token. ///

public TokenFiltersDescriptor Trim(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new TrimTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// The truncate token filter can be used to truncate tokens into a specific length. This can come in handy with keyword /// (single token) /// based mapped fields that are used for sorting in order to reduce memory usage. ///

public TokenFiltersDescriptor Truncate(string name, Func selector)
{
    // Build the filter from a new descriptor (null when no selector is given), then register it.
    var filter = selector?.Invoke(new TruncateTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The unique token filter can be used to only index unique tokens during analysis. By default it is applied on all the /// token stream ///

public TokenFiltersDescriptor Unique(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new UniqueTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// A token filter of type uppercase that normalizes token text to upper case. ///

public TokenFiltersDescriptor Uppercase(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new UppercaseTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// A token filter of type fingerprint. The fingerprint token filter emits a single token which is useful /// for fingerprinting a body of text, and/or providing a token that can be clustered on. /// It does this by sorting the tokens, deduplicating and then concatenating them back into a single token. ///

public TokenFiltersDescriptor Fingerprint(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new FingerprintTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// The kuromoji_stemmer token filter normalizes common katakana spelling variations ending in a /// long sound character by removing this character (U+30FC). Only full-width katakana characters are supported. /// Part of the `analysis-kuromoji` plugin: /// ///

public TokenFiltersDescriptor KuromojiStemmer(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new KuromojiStemmerTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}

///

/// The kuromoji_readingform token filter replaces the token with its reading form in either katakana or romaji. /// Part of the `analysis-kuromoji` plugin: /// ///

public TokenFiltersDescriptor KuromojiReadingForm(string name, Func selector)
{
    // Null-conditional invoke, matching the sibling methods in this class: a null selector no longer
    // throws NullReferenceException here; null is passed to Assign instead.
    var filter = selector?.Invoke(new KuromojiReadingFormTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// The kuromoji_part_of_speech token filter removes tokens that match a set of part-of-speech tags. /// Part of the `analysis-kuromoji` plugin: /// ///

public TokenFiltersDescriptor KuromojiPartOfSpeech(string name, Func selector)
{
    // Null-conditional invoke, matching the sibling methods in this class: a null selector no longer
    // throws NullReferenceException here; null is passed to Assign instead.
    var filter = selector?.Invoke(new KuromojiPartOfSpeechTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Collations are used for sorting documents in a language-specific word order. The icu_collation token filter is /// available to all indices and /// defaults to using the DUCET collation, which is a best-effort attempt at language-neutral sorting. /// Part of the `analysis-icu` plugin: ///

public TokenFiltersDescriptor IcuCollation(string name, Func selector)
{
    // Null-conditional invoke, matching the sibling methods in this class: a null selector no longer
    // throws NullReferenceException here; null is passed to Assign instead.
    var filter = selector?.Invoke(new IcuCollationTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Case folding of Unicode characters based on UTR#30, like the ASCII-folding token filter on steroids. /// Part of the `analysis-icu` plugin: ///

public TokenFiltersDescriptor IcuFolding(string name, Func selector)
{
    // Null-conditional invoke, matching the sibling methods in this class: a null selector no longer
    // throws NullReferenceException here; null is passed to Assign instead.
    var filter = selector?.Invoke(new IcuFoldingTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization /// Part of the `analysis-icu` plugin: ///

public TokenFiltersDescriptor IcuNormalization(string name, Func selector)
{
    // Null-conditional invoke, matching the sibling methods in this class: a null selector no longer
    // throws NullReferenceException here; null is passed to Assign instead.
    var filter = selector?.Invoke(new IcuNormalizationTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// Transforms are used to process Unicode text in many different ways, such as case mapping, /// normalization, transliteration and bidirectional text handling. /// Part of the `analysis-icu` plugin: ///

public TokenFiltersDescriptor IcuTransform(string name, Func selector)
{
    // Null-conditional invoke, matching the sibling methods in this class: a null selector no longer
    // throws NullReferenceException here; null is passed to Assign instead.
    var filter = selector?.Invoke(new IcuTransformTokenFilterDescriptor());
    return Assign(name, filter);
}

// NoriPartOfSpeech previously shared a physical line with IcuTransform, behind a `///` marker,
// which turned its declaration into comment text. It now sits on its own lines.

///
public TokenFiltersDescriptor NoriPartOfSpeech(string name, Func selector)
{
    // Same null-conditional invoke as above, for consistency with the rest of the class.
    var filter = selector?.Invoke(new NoriPartOfSpeechTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type multiplexer will emit multiple tokens at the same position, each version of the token /// having been run through a different filter. Identical output tokens at the same position will be removed. ///

public TokenFiltersDescriptor Multiplexer(string name, Func selector)
{
    // Null-conditional invoke, matching the sibling methods in this class: a null selector no longer
    // throws NullReferenceException here; null is passed to Assign instead.
    var filter = selector?.Invoke(new MultiplexerTokenFilterDescriptor());
    return Assign(name, filter);
}

///

/// A token filter of type remove_duplicates that drops identical tokens at the same position.

public TokenFiltersDescriptor RemoveDuplicates(string name, Func selector = null)
{
    // selector is optional; InvokeOrDefault always receives a fresh descriptor.
    // NOTE(review): presumably it falls back to that descriptor when selector is null; confirm in the extension.
    var descriptor = new RemoveDuplicatesTokenFilterDescriptor();
    return Assign(name, selector.InvokeOrDefault(descriptor));
}
} // class TokenFiltersDescriptor
} // namespace OpenSearch.Client