/* SPDX-License-Identifier: Apache-2.0 * * The OpenSearch Contributors require contributions made to * this file be licensed under the Apache-2.0 license or a * compatible open source license. */ /* * Modifications Copyright OpenSearch Contributors. See * GitHub history for details. * * Licensed to Elasticsearch B.V. under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch B.V. licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. 
*/

using System;
using System.Collections.Generic;
using OpenSearch.Net.Utf8Json;

namespace OpenSearch.Client
{
    /// <summary>
    /// A dictionary of <see cref="ITokenFilter" /> definitions. Serialization is delegated to
    /// <c>VerbatimDictionaryKeysFormatter</c>.
    /// </summary>
    [JsonFormatter(typeof(VerbatimDictionaryKeysFormatter))]
    public interface ITokenFilters : IIsADictionary { }

    /// <summary>
    /// Default <see cref="ITokenFilters" /> implementation that stores token filters by name.
    /// </summary>
    public class TokenFilters : IsADictionaryBase, ITokenFilters
    {
        /// <summary>
        /// Creates the collection using the base class defaults.
        /// </summary>
        public TokenFilters() { }

        /// <summary>
        /// Creates the collection from <paramref name="container" />, which is handed to the base class.
        /// </summary>
        public TokenFilters(IDictionary container) : base(container) { }

        /// <summary>
        /// Creates the collection from <paramref name="container" />, which is handed to the base class.
        /// </summary>
        public TokenFilters(Dictionary container) : base(container) { }

        /// <summary>
        /// Adds a token filter to the backing dictionary.
        /// </summary>
        /// <param name="name">The key the filter is stored under.</param>
        /// <param name="analyzer">
        /// The token filter to add. Despite its name this is a token filter, not an analyzer; the
        /// parameter name is kept so callers using named arguments keep compiling.
        /// </param>
        public void Add(string name, ITokenFilter analyzer) => BackingDictionary.Add(name, analyzer);
    }

    /// <summary>
    /// Fluent builder for <see cref="TokenFilters" />. Each method hands a named token filter to
    /// <c>Assign</c> and returns its result so calls can be chained.
    /// </summary>
    /// <remarks>
    /// Every method that takes a <c>selector</c> invokes it with a new descriptor of the matching
    /// filter type. Where the selector has no default value it is invoked with <c>?.</c>, so a
    /// <c>null</c> selector passes a <c>null</c> filter to <c>Assign</c> instead of throwing a
    /// <see cref="NullReferenceException" />. Where the selector defaults to <c>null</c>,
    /// <c>InvokeOrDefault</c> is used; it is defined elsewhere and presumably falls back to the
    /// unconfigured descriptor - confirm against its implementation.
    /// </remarks>
    public class TokenFiltersDescriptor : IsADictionaryDescriptorBase
    {
        /// <summary>
        /// Creates a descriptor backed by a new <see cref="TokenFilters" /> instance.
        /// </summary>
        public TokenFiltersDescriptor() : base(new TokenFilters()) { }

        /// <summary>
        /// Registers an already constructed token filter under <paramref name="name" />.
        /// </summary>
        public TokenFiltersDescriptor UserDefined(string name, ITokenFilter analyzer) => Assign(name, analyzer);

        /// <summary>
        /// A token filter that decomposes compound words using a dictionary.
        /// </summary>
        public TokenFiltersDescriptor DictionaryDecompounder(string name, Func selector) =>
            Assign(name, selector?.Invoke(new DictionaryDecompounderTokenFilterDescriptor()));

        /// <summary>
        /// A token filter that decomposes compound words using FOP XML hyphenation patterns.
        /// </summary>
        public TokenFiltersDescriptor HyphenationDecompounder(string name, Func selector) =>
            Assign(name, selector?.Invoke(new HyphenationDecompounderTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type edgeNGram.
        /// </summary>
        public TokenFiltersDescriptor EdgeNGram(string name, Func selector) =>
            Assign(name, selector?.Invoke(new EdgeNGramTokenFilterDescriptor()));

        /// <summary>
        /// The phonetic token filter is provided as a plugin.
        /// </summary>
        public TokenFiltersDescriptor Phonetic(string name, Func selector) =>
            Assign(name, selector?.Invoke(new PhoneticTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type shingle that constructs shingles (token n-grams) from a token stream.
        /// In other words, it creates combinations of tokens as a single token.
        /// </summary>
        public TokenFiltersDescriptor Shingle(string name, Func selector) =>
            Assign(name, selector?.Invoke(new ShingleTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type stop that removes stop words from token streams.
        /// </summary>
        public TokenFiltersDescriptor Stop(string name, Func selector) =>
            Assign(name, selector?.Invoke(new StopTokenFilterDescriptor()));

        /// <summary>
        /// The synonym token filter allows to easily handle synonyms during the analysis process.
        /// </summary>
        public TokenFiltersDescriptor Synonym(string name, Func selector) =>
            Assign(name, selector?.Invoke(new SynonymTokenFilterDescriptor()));

        /// <summary>
        /// The synonym_graph token filter allows to easily handle synonyms,
        /// including multi-word synonyms correctly during the analysis process.
        /// </summary>
        public TokenFiltersDescriptor SynonymGraph(string name, Func selector) =>
            Assign(name, selector?.Invoke(new SynonymGraphTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type word_delimiter that splits words into subwords and performs
        /// optional transformations on subword groups.
        /// </summary>
        public TokenFiltersDescriptor WordDelimiter(string name, Func selector) =>
            Assign(name, selector?.Invoke(new WordDelimiterTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type word_delimiter_graph that splits words into subwords and performs
        /// optional transformations on subword groups, producing a token graph.
        /// </summary>
        public TokenFiltersDescriptor WordDelimiterGraph(string name, Func selector) =>
            Assign(name, selector?.Invoke(new WordDelimiterGraphTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type asciifolding that converts alphabetic, numeric, and symbolic Unicode
        /// characters which are not in the first 127 ASCII characters (the “Basic Latin” Unicode block)
        /// into their ASCII equivalents, if one exists.
        /// </summary>
        public TokenFiltersDescriptor AsciiFolding(string name, Func selector) =>
            Assign(name, selector?.Invoke(new AsciiFoldingTokenFilterDescriptor()));

        /// <summary>
        /// Token filter that generates bigrams for frequently occurring terms. Single terms are still indexed.
        /// Note, common_words or common_words_path field is required.
        /// </summary>
        public TokenFiltersDescriptor CommonGrams(string name, Func selector) =>
            Assign(name, selector?.Invoke(new CommonGramsTokenFilterDescriptor()));

        /// <summary>
        /// Splits tokens into tokens and payload whenever a delimiter character is found.
        /// </summary>
        public TokenFiltersDescriptor DelimitedPayload(string name, Func selector) =>
            Assign(name, selector?.Invoke(new DelimitedPayloadTokenFilterDescriptor()));

        /// <summary>
        /// A token filter which removes elisions. For example, “l’avion” (the plane) will be tokenized
        /// as “avion” (plane).
        /// </summary>
        public TokenFiltersDescriptor Elision(string name, Func selector) =>
            Assign(name, selector?.Invoke(new ElisionTokenFilterDescriptor()));

        /// <summary>
        /// Basic support for hunspell stemming.
        /// Hunspell dictionaries will be picked up from a dedicated hunspell directory on the filesystem.
        /// </summary>
        public TokenFiltersDescriptor Hunspell(string name, Func selector) =>
            Assign(name, selector?.Invoke(new HunspellTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type keep_types that only keeps tokens whose token type is contained in a
        /// predefined set.
        /// </summary>
        public TokenFiltersDescriptor KeepTypes(string name, Func selector) =>
            Assign(name, selector?.Invoke(new KeepTypesTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type keep that only keeps tokens with text contained in a predefined set of words.
        /// </summary>
        public TokenFiltersDescriptor KeepWords(string name, Func selector) =>
            Assign(name, selector?.Invoke(new KeepWordsTokenFilterDescriptor()));

        /// <summary>
        /// Protects words from being modified by stemmers. Must be placed before any stemming filters.
        /// </summary>
        public TokenFiltersDescriptor KeywordMarker(string name, Func selector) =>
            Assign(name, selector?.Invoke(new KeywordMarkerTokenFilterDescriptor()));

        /// <summary>
        /// The kstem token filter is a high performance filter for english.
        /// All terms must already be lowercased (use lowercase filter) for this filter to work correctly.
        /// </summary>
        public TokenFiltersDescriptor KStem(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new KStemTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type length that removes words that are too long or too short for the stream.
        /// </summary>
        public TokenFiltersDescriptor Length(string name, Func selector) =>
            Assign(name, selector?.Invoke(new LengthTokenFilterDescriptor()));

        /// <summary>
        /// Limits the number of tokens that are indexed per document and field.
        /// </summary>
        public TokenFiltersDescriptor LimitTokenCount(string name, Func selector) =>
            Assign(name, selector?.Invoke(new LimitTokenCountTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type lowercase that normalizes token text to lower case.
        /// Lowercase token filter supports Greek and Turkish lowercase token filters through the language parameter.
        /// </summary>
        public TokenFiltersDescriptor Lowercase(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new LowercaseTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type nGram.
        /// </summary>
        public TokenFiltersDescriptor NGram(string name, Func selector) =>
            Assign(name, selector?.Invoke(new NGramTokenFilterDescriptor()));

        /// <summary>
        /// The pattern_capture token filter, unlike the pattern tokenizer, emits a token for every
        /// capture group in the regular expression.
        /// </summary>
        public TokenFiltersDescriptor PatternCapture(string name, Func selector) =>
            Assign(name, selector?.Invoke(new PatternCaptureTokenFilterDescriptor()));

        /// <summary>
        /// The pattern_replace token filter allows to easily handle string replacements based on a
        /// regular expression.
        /// </summary>
        public TokenFiltersDescriptor PatternReplace(string name, Func selector) =>
            Assign(name, selector?.Invoke(new PatternReplaceTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type porterStem that transforms the token stream as per the Porter
        /// stemming algorithm.
        /// </summary>
        public TokenFiltersDescriptor PorterStem(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new PorterStemTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type reverse that simply reverses the tokens.
        /// </summary>
        public TokenFiltersDescriptor Reverse(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new ReverseTokenFilterDescriptor()));

        /// <summary>
        /// A filter that stems words using a Snowball-generated stemmer.
        /// </summary>
        public TokenFiltersDescriptor Snowball(string name, Func selector) =>
            Assign(name, selector?.Invoke(new SnowballTokenFilterDescriptor()));

        /// <summary>
        /// A filter that stems words (similar to snowball, but with more options).
        /// </summary>
        public TokenFiltersDescriptor Stemmer(string name, Func selector) =>
            Assign(name, selector?.Invoke(new StemmerTokenFilterDescriptor()));

        /// <summary>
        /// The predicate token filter takes a predicate script and removes tokens that do not match it.
        /// </summary>
        public TokenFiltersDescriptor Predicate(string name, Func selector) =>
            Assign(name, selector?.Invoke(new PredicateTokenFilterDescriptor()));

        /// <summary>
        /// The conditional token filter takes a predicate script and a list of subfilters, and only
        /// applies the subfilters to a token if it matches the predicate script.
        /// </summary>
        public TokenFiltersDescriptor Condition(string name, Func selector) =>
            Assign(name, selector?.Invoke(new ConditionTokenFilterDescriptor()));

        /// <summary>
        /// Overrides stemming algorithms, by applying a custom mapping, then protecting these terms
        /// from being modified by stemmers. Must be placed before any stemming filters.
        /// </summary>
        public TokenFiltersDescriptor StemmerOverride(string name, Func selector) =>
            Assign(name, selector?.Invoke(new StemmerOverrideTokenFilterDescriptor()));

        /// <summary>
        /// The trim token filter trims surrounding whitespaces around a token.
        /// </summary>
        public TokenFiltersDescriptor Trim(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new TrimTokenFilterDescriptor()));

        /// <summary>
        /// The truncate token filter can be used to truncate tokens into a specific length. This can
        /// come in handy with keyword (single token) based mapped fields that are used for sorting in
        /// order to reduce memory usage.
        /// </summary>
        public TokenFiltersDescriptor Truncate(string name, Func selector) =>
            Assign(name, selector?.Invoke(new TruncateTokenFilterDescriptor()));

        /// <summary>
        /// The unique token filter can be used to only index unique tokens during analysis.
        /// By default it is applied on all the token stream.
        /// </summary>
        public TokenFiltersDescriptor Unique(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new UniqueTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type uppercase that normalizes token text to upper case.
        /// </summary>
        public TokenFiltersDescriptor Uppercase(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new UppercaseTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type fingerprint that emits a single token which is useful for
        /// fingerprinting a body of text, and/or providing a token that can be clustered on.
        /// It does this by sorting the tokens, deduplicating and then concatenating them back into a
        /// single token.
        /// </summary>
        public TokenFiltersDescriptor Fingerprint(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new FingerprintTokenFilterDescriptor()));

        /// <summary>
        /// The kuromoji_stemmer token filter normalizes common katakana spelling variations ending in a
        /// long sound character by removing this character (U+30FC). Only full-width katakana
        /// characters are supported.
        /// Part of the `analysis-kuromoji` plugin.
        /// </summary>
        public TokenFiltersDescriptor KuromojiStemmer(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new KuromojiStemmerTokenFilterDescriptor()));

        /// <summary>
        /// The kuromoji_readingform token filter replaces the token with its reading form in either
        /// katakana or romaji.
        /// Part of the `analysis-kuromoji` plugin.
        /// </summary>
        public TokenFiltersDescriptor KuromojiReadingForm(string name, Func selector) =>
            Assign(name, selector?.Invoke(new KuromojiReadingFormTokenFilterDescriptor()));

        /// <summary>
        /// The kuromoji_part_of_speech token filter removes tokens that match a set of part-of-speech tags.
        /// Part of the `analysis-kuromoji` plugin.
        /// </summary>
        public TokenFiltersDescriptor KuromojiPartOfSpeech(string name, Func selector) =>
            Assign(name, selector?.Invoke(new KuromojiPartOfSpeechTokenFilterDescriptor()));

        /// <summary>
        /// Collations are used for sorting documents in a language-specific word order. The
        /// icu_collation token filter is available to all indices and defaults to using the DUCET
        /// collation, which is a best-effort attempt at language-neutral sorting.
        /// Part of the `analysis-icu` plugin.
        /// </summary>
        public TokenFiltersDescriptor IcuCollation(string name, Func selector) =>
            Assign(name, selector?.Invoke(new IcuCollationTokenFilterDescriptor()));

        /// <summary>
        /// Case folding of Unicode characters based on UTR#30, like the ASCII-folding token filter on steroids.
        /// Part of the `analysis-icu` plugin.
        /// </summary>
        public TokenFiltersDescriptor IcuFolding(string name, Func selector) =>
            Assign(name, selector?.Invoke(new IcuFoldingTokenFilterDescriptor()));

        /// <summary>
        /// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
        /// Part of the `analysis-icu` plugin.
        /// </summary>
        public TokenFiltersDescriptor IcuNormalization(string name, Func selector) =>
            Assign(name, selector?.Invoke(new IcuNormalizationTokenFilterDescriptor()));

        /// <summary>
        /// Transforms are used to process Unicode text in many different ways, such as case mapping,
        /// normalization, transliteration and bidirectional text handling.
        /// Part of the `analysis-icu` plugin.
        /// </summary>
        public TokenFiltersDescriptor IcuTransform(string name, Func selector) =>
            Assign(name, selector?.Invoke(new IcuTransformTokenFilterDescriptor()));

        /// <summary>
        /// The nori_part_of_speech token filter removes tokens that match a set of part-of-speech tags.
        /// Part of the `analysis-nori` plugin.
        /// </summary>
        public TokenFiltersDescriptor NoriPartOfSpeech(string name, Func selector) =>
            Assign(name, selector?.Invoke(new NoriPartOfSpeechTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type multiplexer will emit multiple tokens at the same position, each
        /// version of the token having been run through a different filter. Identical output tokens at
        /// the same position will be removed.
        /// </summary>
        public TokenFiltersDescriptor Multiplexer(string name, Func selector) =>
            Assign(name, selector?.Invoke(new MultiplexerTokenFilterDescriptor()));

        /// <summary>
        /// A token filter of type remove_duplicates that drops identical tokens at the same position.
        /// </summary>
        public TokenFiltersDescriptor RemoveDuplicates(string name, Func selector = null) =>
            Assign(name, selector.InvokeOrDefault(new RemoveDuplicatesTokenFilterDescriptor()));
    }
}