/* SPDX-License-Identifier: Apache-2.0 * * The OpenSearch Contributors require contributions made to * this file be licensed under the Apache-2.0 license or a * compatible open source license. */ /* * Modifications Copyright OpenSearch Contributors. See * GitHub history for details. * * Licensed to Elasticsearch B.V. under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch B.V. licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. 
*/

using System;
using OpenSearch.Client;
using OpenSearch.Client.Specification.IndicesApi;
using Tests.IndexModules;

namespace Tests.Analysis.TokenFilters
{
	/// <summary>
	/// Usage example that registers one token filter of each kind three ways:
	/// through the fluent descriptor (<see cref="FluentExample"/>), through object
	/// initializers (<see cref="InitializerExample"/>), and as the JSON shape both
	/// are expected to produce (<see cref="ExpectJson"/>).
	/// </summary>
	/// <remarks>
	/// NOTE(review): how the three members are compared is defined by
	/// PromiseUsageTestBase, which is not part of this file — confirm there.
	/// </remarks>
	/**
	 */
	public class TokenFilterUsageTests : PromiseUsageTestBase<IIndexSettings, IndexSettingsDescriptor, IndexSettings>
	{
		/// <summary>
		/// Fluent form: every filter is added by name on the token-filters
		/// descriptor inside the analysis section of the index settings.
		/// </summary>
		public static Func<IndexSettingsDescriptor, IPromise<IndexSettings>> FluentExample => s => s
			.Analysis(analysis => analysis
				.TokenFilters(tf => tf
					.AsciiFolding("myAscii", t => t.PreserveOriginal())
					.CommonGrams("myCommonGrams", t => t
						.CommonWords("x", "y", "z")
						.IgnoreCase()
						.QueryMode()
					)
					.DelimitedPayload("mydp", t => t
						.Delimiter('-')
						.Encoding(DelimitedPayloadEncoding.Identity)
					)
					.DictionaryDecompounder("dcc", t => t
						.MaxSubwordSize(2)
						.MinSubwordSize(2)
						.MinWordSize(2)
						.OnlyLongestMatch()
						.WordList("x", "y", "z")
					)
					.EdgeNGram("etf", t => t
						.MaxGram(2)
						.MinGram(1)
					)
					.Elision("elision", t => t
						.Articles("a", "b", "c")
					)
					.Hunspell("hunspell", t => t
						.Dedup()
						.Dictionary("path_to_dict")
						.Locale("en_US")
						.LongestOnly()
					)
					.HyphenationDecompounder("hypdecomp", t => t
						.MaxSubwordSize(2)
						.MinSubwordSize(2)
						.MinWordSize(2)
						.OnlyLongestMatch()
						.WordList("x", "y", "z")
						.HyphenationPatternsPath("analysis/fop.xml")
					)
					// NOTE(review): both type names are empty strings in all three members of this
					// class; they look like angle-bracketed token types that were lost — confirm.
					.KeepTypes("keeptypes", t => t
						.Types("", "")
					)
					.KeepWords("keepwords", t => t
						.KeepWords("a", "b", "c")
						.KeepWordsCase()
					)
					.KeywordMarker("marker", t => t
						.IgnoreCase()
						.Keywords("a", "b")
					)
					.KStem("kstem")
					.Length("length", t => t
						.Max(200)
						.Min(10)
					)
					.LimitTokenCount("limit", t => t
						.ConsumeAllToken()
						.MaxTokenCount(12)
					)
					.Lowercase("lc")
					.NGram("ngram", t => t
						.MinGram(3)
						.MaxGram(4)
					)
					.PatternCapture("pc", t => t
						.Patterns(@"\d", @"\w")
						.PreserveOriginal()
					)
					.PatternReplace("pr", t => t
						.Pattern(@"(\d|\w)")
						.Replacement("replacement")
					)
					.PorterStem("porter")
					.Reverse("rev")
					.Shingle("shing", t => t
						.FillerToken("x")
						.MaxShingleSize(10)
						.MinShingleSize(8)
						.OutputUnigrams()
						.OutputUnigramsIfNoShingles()
						.TokenSeparator("|")
					)
					.Snowball("snow", t => t.Language(SnowballLanguage.Dutch))
					.Stemmer("stem", t => t.Language("arabic"))
					.StemmerOverride("stemo", t => t.RulesPath("analysis/custom_stems.txt"))
					.Stop("stop", t => t
						.IgnoreCase()
						.RemoveTrailing()
						.StopWords("x", "y", "z")
					)
					.Synonym("syn", t => t
						.Expand()
						.Format(SynonymFormat.WordNet)
						.SynonymsPath("analysis/stopwords.txt")
						.Synonyms("x=>y", "z=>s")
						.Tokenizer("whitespace")
					)
					.SynonymGraph("syn_graph", t => t
						.Expand()
						.Format(SynonymFormat.WordNet)
						.SynonymsPath("analysis/stopwords.txt")
						.Synonyms("x=>y", "z=>s")
						.Tokenizer("whitespace")
					)
					.Trim("trimmer")
					.Truncate("truncer", t => t.Length(100))
					.Unique("uq", t => t.OnlyOnSamePosition())
					.Uppercase("upper")
					.WordDelimiter("wd", t => t
						.CatenateAll()
						.CatenateNumbers()
						.CatenateWords()
						.GenerateNumberParts()
						.GenerateWordParts()
						.PreserveOriginal()
						.ProtectedWords("x", "y", "z")
						.SplitOnCaseChange()
						.SplitOnNumerics()
						.StemEnglishPossessive()
					)
					.WordDelimiterGraph("wdg", t => t
						.CatenateAll()
						.CatenateNumbers()
						.CatenateWords()
						.GenerateNumberParts()
						.GenerateWordParts()
						.IgnoreKeywords()
						.PreserveOriginal()
						.ProtectedWords("x", "y", "z")
						.SplitOnCaseChange()
						.SplitOnNumerics()
						.StemEnglishPossessive()
					)
					.KuromojiPartOfSpeech("kpos", t => t
						.StopTags("# verb-main:", "動詞-自立")
					)
					.KuromojiReadingForm("kfr", t => t
						.UseRomaji()
					)
					.KuromojiStemmer("ks", t => t
						.MinimumLength(4)
					)
					.IcuCollation("icuc", t => t
						.Alternate(IcuCollationAlternate.NonIgnorable)
						.CaseFirst(IcuCollationCaseFirst.Lower)
						.HiraganaQuaternaryMode()
						.Decomposition(IcuCollationDecomposition.No)
						.Numeric()
						.CaseLevel()
						.Country("DE")
						.Language("de")
						.Strength(IcuCollationStrength.Tertiary)
						.Variant("@collation=phonebook")
					)
					.IcuFolding("icuf", t => t.UnicodeSetFilter("[^åäöÅÄÖ]"))
					.IcuNormalization("icun", t => t.Name(IcuNormalizationType.Canonical))
					.IcuTransform("icut", t => t
						.Direction(IcuTransformDirection.Forward)
						.Id("Any-Latin; NFD; [:Nonspacing Mark:] Remove; NFC")
					)
					.Phonetic("phonetic", t => t
						.Encoder(PhoneticEncoder.Beidermorse)
						.RuleType(PhoneticRuleType.Exact)
						.NameType(PhoneticNameType.Sephardic)
						.LanguageSet(PhoneticLanguage.Cyrillic, PhoneticLanguage.English, PhoneticLanguage.Hebrew)
					)
				)
			);

		/// <summary>
		/// Object-initializer form of the same filters, keyed by the same names
		/// and carrying the same option values as <see cref="FluentExample"/>.
		/// </summary>
		public static IndexSettings InitializerExample => new IndexSettings
		{
			Analysis = new OpenSearch.Client.Analysis
			{
				TokenFilters = new OpenSearch.Client.TokenFilters
				{
					{ "myAscii", new AsciiFoldingTokenFilter { PreserveOriginal = true } },
					{
						"myCommonGrams", new CommonGramsTokenFilter
						{
							QueryMode = true,
							IgnoreCase = true,
							CommonWords = new[] { "x", "y", "z" }
						}
					},
					{
						"mydp", new DelimitedPayloadTokenFilter
						{
							Delimiter = '-',
							Encoding = DelimitedPayloadEncoding.Identity
						}
					},
					{
						"dcc", new DictionaryDecompounderTokenFilter
						{
							MinWordSize = 2,
							MinSubwordSize = 2,
							MaxSubwordSize = 2,
							OnlyLongestMatch = true,
							WordList = new[] { "x", "y", "z" }
						}
					},
					{ "etf", new EdgeNGramTokenFilter { MaxGram = 2, MinGram = 1 } },
					{ "elision", new ElisionTokenFilter { Articles = new[] { "a", "b", "c" } } },
					{
						"hunspell", new HunspellTokenFilter
						{
							Dedup = true,
							Dictionary = "path_to_dict",
							Locale = "en_US",
							LongestOnly = true
						}
					},
					{
						"hypdecomp", new HyphenationDecompounderTokenFilter
						{
							MaxSubwordSize = 2,
							MinSubwordSize = 2,
							MinWordSize = 2,
							OnlyLongestMatch = true,
							WordList = new[] { "x", "y", "z" },
							HyphenationPatternsPath = "analysis/fop.xml"
						}
					},
					{ "keeptypes", new KeepTypesTokenFilter { Types = new[] { "", "" } } },
					{
						"keepwords", new KeepWordsTokenFilter
						{
							KeepWordsCase = true,
							KeepWords = new[] { "a", "b", "c" }
						}
					},
					{
						"marker", new KeywordMarkerTokenFilter
						{
							IgnoreCase = true,
							Keywords = new[] { "a", "b" }
						}
					},
					{ "kstem", new KStemTokenFilter() },
					{ "length", new LengthTokenFilter { Min = 10, Max = 200 } },
					{
						"limit", new LimitTokenCountTokenFilter
						{
							ConsumeAllTokens = true,
							MaxTokenCount = 12
						}
					},
					{ "lc", new LowercaseTokenFilter() },
					{ "ngram", new NGramTokenFilter { MinGram = 3, MaxGram = 4 } },
					{
						"pc", new PatternCaptureTokenFilter
						{
							Patterns = new[] { @"\d", @"\w" },
							PreserveOriginal = true
						}
					},
					{
						"pr", new PatternReplaceTokenFilter
						{
							Pattern = @"(\d|\w)",
							Replacement = "replacement"
						}
					},
					{ "porter", new PorterStemTokenFilter() },
					{ "rev", new ReverseTokenFilter() },
					{
						"shing", new ShingleTokenFilter
						{
							FillerToken = "x",
							MaxShingleSize = 10,
							MinShingleSize = 8,
							OutputUnigrams = true,
							OutputUnigramsIfNoShingles = true,
							TokenSeparator = "|"
						}
					},
					{ "snow", new SnowballTokenFilter { Language = SnowballLanguage.Dutch } },
					{ "stem", new StemmerTokenFilter { Language = "arabic" } },
					{ "stemo", new StemmerOverrideTokenFilter { RulesPath = "analysis/custom_stems.txt" } },
					{
						"stop", new StopTokenFilter
						{
							IgnoreCase = true,
							RemoveTrailing = true,
							StopWords = new[] { "x", "y", "z" }
						}
					},
					{
						"syn", new SynonymTokenFilter
						{
							Expand = true,
							Format = SynonymFormat.WordNet,
							SynonymsPath = "analysis/stopwords.txt",
							Synonyms = new[] { "x=>y", "z=>s" },
							Tokenizer = "whitespace"
						}
					},
					{
						"syn_graph", new SynonymGraphTokenFilter
						{
							Expand = true,
							Format = SynonymFormat.WordNet,
							SynonymsPath = "analysis/stopwords.txt",
							Synonyms = new[] { "x=>y", "z=>s" },
							Tokenizer = "whitespace"
						}
					},
					{ "trimmer", new TrimTokenFilter() },
					{ "truncer", new TruncateTokenFilter { Length = 100 } },
					{ "uq", new UniqueTokenFilter { OnlyOnSamePosition = true, } },
					{ "upper", new UppercaseTokenFilter() },
					{
						"wd", new WordDelimiterTokenFilter
						{
							CatenateAll = true,
							CatenateNumbers = true,
							CatenateWords = true,
							GenerateNumberParts = true,
							GenerateWordParts = true,
							PreserveOriginal = true,
							ProtectedWords = new[] { "x", "y", "z" },
							SplitOnCaseChange = true,
							SplitOnNumerics = true,
							StemEnglishPossessive = true
						}
					},
					{
						"wdg", new WordDelimiterGraphTokenFilter
						{
							CatenateAll = true,
							CatenateNumbers = true,
							CatenateWords = true,
							GenerateNumberParts = true,
							GenerateWordParts = true,
							IgnoreKeywords = true,
							PreserveOriginal = true,
							ProtectedWords = new[] { "x", "y", "z" },
							SplitOnCaseChange = true,
							SplitOnNumerics = true,
							StemEnglishPossessive = true
						}
					},
					{
						"kpos", new KuromojiPartOfSpeechTokenFilter
						{
							StopTags = new[] { "# verb-main:", "動詞-自立" }
						}
					},
					{ "kfr", new KuromojiReadingFormTokenFilter { UseRomaji = true } },
					{ "ks", new KuromojiStemmerTokenFilter { MinimumLength = 4 } },
					{
						"icuc", new IcuCollationTokenFilter
						{
							Alternate = IcuCollationAlternate.NonIgnorable,
							CaseFirst = IcuCollationCaseFirst.Lower,
							HiraganaQuaternaryMode = true,
							Decomposition = IcuCollationDecomposition.No,
							Numeric = true,
							CaseLevel = true,
							Country = "DE",
							Language = "de",
							Strength = IcuCollationStrength.Tertiary,
							Variant = "@collation=phonebook"
						}
					},
					{ "icuf", new IcuFoldingTokenFilter { UnicodeSetFilter = "[^åäöÅÄÖ]" } },
					{ "icun", new IcuNormalizationTokenFilter { Name = IcuNormalizationType.Canonical } },
					{
						"icut", new IcuTransformTokenFilter
						{
							Direction = IcuTransformDirection.Forward,
							Id = "Any-Latin; NFD; [:Nonspacing Mark:] Remove; NFC"
						}
					},
					{
						"phonetic", new PhoneticTokenFilter
						{
							Encoder = PhoneticEncoder.Beidermorse,
							RuleType = PhoneticRuleType.Exact,
							NameType = PhoneticNameType.Sephardic,
							LanguageSet = new[]
							{
								PhoneticLanguage.Cyrillic,
								PhoneticLanguage.English,
								PhoneticLanguage.Hebrew
							}
						}
					},
				}
			}
		};

		/// <summary>
		/// Expected JSON shape: one entry under <c>analysis.filter</c> for every
		/// filter name registered by <see cref="FluentExample"/> and
		/// <see cref="InitializerExample"/>, and no others.
		/// </summary>
		protected override object ExpectJson => new
		{
			analysis = new
			{
				filter = new
				{
					myAscii = new
					{
						type = "asciifolding",
						preserve_original = true
					},
					myCommonGrams = new
					{
						type = "common_grams",
						common_words = new[] { "x", "y", "z" },
						ignore_case = true,
						query_mode = true
					},
					mydp = new
					{
						type = "delimited_payload",
						delimiter = "-",
						encoding = "identity"
					},
					dcc = new
					{
						type = "dictionary_decompounder",
						word_list = new[] { "x", "y", "z" },
						min_word_size = 2,
						min_subword_size = 2,
						max_subword_size = 2,
						only_longest_match = true
					},
					etf = new
					{
						type = "edge_ngram",
						min_gram = 1,
						max_gram = 2
					},
					elision = new
					{
						type = "elision",
						articles = new[] { "a", "b", "c" }
					},
					hunspell = new
					{
						type = "hunspell",
						locale = "en_US",
						dictionary = "path_to_dict",
						dedup = true,
						longest_only = true
					},
					hypdecomp = new
					{
						type = "hyphenation_decompounder",
						word_list = new[] { "x", "y", "z" },
						min_word_size = 2,
						min_subword_size = 2,
						max_subword_size = 2,
						only_longest_match = true,
						hyphenation_patterns_path = "analysis/fop.xml"
					},
					keeptypes = new
					{
						type = "keep_types",
						types = new[] { "", "" }
					},
					// The ICU entries below use camelCase keys, unlike the snake_case used elsewhere.
					icuc = new
					{
						alternate = "non-ignorable",
						caseFirst = "lower",
						caseLevel = true,
						country = "DE",
						decomposition = "no",
						hiraganaQuaternaryMode = true,
						language = "de",
						numeric = true,
						strength = "tertiary",
						type = "icu_collation",
						variant = "@collation=phonebook"
					},
					icuf = new
					{
						type = "icu_folding",
						unicodeSetFilter = "[^åäöÅÄÖ]"
					},
					icun = new
					{
						name = "nfc",
						type = "icu_normalizer"
					},
					icut = new
					{
						dir = "forward",
						id = "Any-Latin; NFD; [:Nonspacing Mark:] Remove; NFC",
						type = "icu_transform"
					},
					keepwords = new
					{
						type = "keep",
						keep_words = new[] { "a", "b", "c" },
						keep_words_case = true
					},
					marker = new
					{
						type = "keyword_marker",
						keywords = new[] { "a", "b" },
						ignore_case = true
					},
					kfr = new
					{
						type = "kuromoji_readingform",
						use_romaji = true
					},
					kpos = new
					{
						stoptags = new[] { "# verb-main:", "動詞-自立" },
						type = "kuromoji_part_of_speech"
					},
					ks = new
					{
						minimum_length = 4,
						type = "kuromoji_stemmer"
					},
					kstem = new
					{
						type = "kstem"
					},
					length = new
					{
						type = "length",
						min = 10,
						max = 200
					},
					limit = new
					{
						type = "limit",
						max_token_count = 12,
						consume_all_tokens = true
					},
					lc = new
					{
						type = "lowercase"
					},
					ngram = new
					{
						type = "ngram",
						min_gram = 3,
						max_gram = 4
					},
					pc = new
					{
						type = "pattern_capture",
						patterns = new[] { "\\d", "\\w" },
						preserve_original = true
					},
					pr = new
					{
						type = "pattern_replace",
						pattern = "(\\d|\\w)",
						replacement = "replacement"
					},
					porter = new
					{
						type = "porter_stem"
					},
					rev = new
					{
						type = "reverse"
					},
					shing = new
					{
						type = "shingle",
						min_shingle_size = 8,
						max_shingle_size = 10,
						output_unigrams = true,
						output_unigrams_if_no_shingles = true,
						token_separator = "|",
						filler_token = "x"
					},
					snow = new
					{
						type = "snowball",
						language = "Dutch"
					},
					stem = new
					{
						type = "stemmer",
						language = "arabic"
					},
					stemo = new
					{
						type = "stemmer_override",
						rules_path = "analysis/custom_stems.txt"
					},
					stop = new
					{
						type = "stop",
						stopwords = new[] { "x", "y", "z" },
						ignore_case = true,
						remove_trailing = true
					},
					syn = new
					{
						type = "synonym",
						synonyms_path = "analysis/stopwords.txt",
						format = "wordnet",
						synonyms = new[] { "x=>y", "z=>s" },
						expand = true,
						tokenizer = "whitespace"
					},
					syn_graph = new
					{
						type = "synonym_graph",
						synonyms_path = "analysis/stopwords.txt",
						format = "wordnet",
						synonyms = new[] { "x=>y", "z=>s" },
						expand = true,
						tokenizer = "whitespace"
					},
					trimmer = new
					{
						type = "trim"
					},
					truncer = new
					{
						type = "truncate",
						length = 100
					},
					uq = new
					{
						type = "unique",
						only_on_same_position = true
					},
					upper = new
					{
						type = "uppercase"
					},
					wd = new
					{
						type = "word_delimiter",
						generate_word_parts = true,
						generate_number_parts = true,
						catenate_words = true,
						catenate_numbers = true,
						catenate_all = true,
						split_on_case_change = true,
						preserve_original = true,
						split_on_numerics = true,
						stem_english_possessive = true,
						protected_words = new[] { "x", "y", "z" }
					},
					wdg = new
					{
						type = "word_delimiter_graph",
						generate_word_parts = true,
						generate_number_parts = true,
						ignore_keywords = true,
						catenate_words = true,
						catenate_numbers = true,
						catenate_all = true,
						split_on_case_change = true,
						preserve_original = true,
						split_on_numerics = true,
						stem_english_possessive = true,
						protected_words = new[] { "x", "y", "z" }
					},
					phonetic = new
					{
						type = "phonetic",
						encoder = "beider_morse",
						rule_type = "exact",
						name_type = "sephardic",
						languageset = new[] { "cyrillic", "english", "hebrew" }
					}
				}
			}
		};

		/// <summary>Supplies <see cref="FluentExample"/> to the base class.</summary>
		/**
		 *
		 */
		protected override Func<IndexSettingsDescriptor, IPromise<IndexSettings>> Fluent => FluentExample;

		/// <summary>Supplies <see cref="InitializerExample"/> to the base class.</summary>
		/**
		 */
		protected override IndexSettings Initializer => InitializerExample;
	}
}