/* SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */
/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 *
 * Licensed to Elasticsearch B.V. under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch B.V. licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *	http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

using System.Collections.Generic;
using System.Runtime.Serialization;
using OpenSearch.Net.Utf8Json;

namespace OpenSearch.Client
{
	/// <summary>
	/// A tokenizer that breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful
	/// for cases where a simple custom tokenization is desired, and the overhead of use of <see cref="PatternTokenizer" /> is not acceptable.
	/// </summary>
	public interface ICharGroupTokenizer : ITokenizer
	{
		/// <summary>
		/// A list containing a list of characters to tokenize the string on. Whenever a character from this list is encountered, a
		/// new token is started. This accepts either single characters like eg. -, or character groups: whitespace, letter, digit,
		/// punctuation, symbol.
		/// </summary>
		[DataMember(Name = "tokenize_on_chars")]
		IEnumerable<string> TokenizeOnCharacters { get; set; }

		/// <summary>
		/// The maximum token length. If a token is seen that exceeds this length then
		/// it is split at <see cref="MaxTokenLength" /> intervals. Defaults to `255`.
		/// </summary>
		[DataMember(Name = "max_token_length")]
		[JsonFormatter(typeof(NullableStringIntFormatter))]
		int? MaxTokenLength { get; set; }
	}

	/// <inheritdoc cref="ICharGroupTokenizer" />
	public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
	{
		// The "type" value serialized for this tokenizer in analysis settings.
		internal const string TokenizerType = "char_group";

		public CharGroupTokenizer() => Type = TokenizerType;

		/// <inheritdoc />
		public IEnumerable<string> TokenizeOnCharacters { get; set; }

		/// <inheritdoc />
		public int? MaxTokenLength { get; set; }
	}

	/// <inheritdoc cref="ICharGroupTokenizer" />
	public class CharGroupTokenizerDescriptor
		: TokenizerDescriptorBase<CharGroupTokenizerDescriptor, ICharGroupTokenizer>, ICharGroupTokenizer
	{
		protected override string Type => CharGroupTokenizer.TokenizerType;

		// Explicit interface implementations hold the state assigned by the fluent methods below.
		IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }
		int? ICharGroupTokenizer.MaxTokenLength { get; set; }

		/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
		public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
			Assign(characters, (a, v) => a.TokenizeOnCharacters = v);

		/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
		public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters) =>
			Assign(characters, (a, v) => a.TokenizeOnCharacters = v);

		/// <inheritdoc cref="ICharGroupTokenizer.MaxTokenLength" />
		public CharGroupTokenizerDescriptor MaxTokenLength(int? maxTokenLength) =>
			Assign(maxTokenLength, (a, v) => a.MaxTokenLength = v);
	}
}