# -*- coding: utf-8 -*-
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
#
# Modifications Copyright OpenSearch Contributors. See
# GitHub history for details.
#
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from pytest import raises

from opensearch_dsl import analysis


def test_analyzer_serializes_as_name():
    a = analysis.analyzer("my_analyzer")
    assert "my_analyzer" == a.to_dict()


def test_analyzer_has_definition():
    a = analysis.CustomAnalyzer(
        "my_analyzer", tokenizer="keyword", filter=["lowercase"]
    )
    assert {
        "type": "custom",
        "tokenizer": "keyword",
        "filter": ["lowercase"],
    } == a.get_definition()


def test_simple_multiplexer_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi", "multiplexer", filters=["lowercase", "lowercase, stop"]
            )
        ],
    )
    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "my_multi": {
                "filters": ["lowercase", "lowercase, stop"],
                "type": "multiplexer",
            }
        },
    } == a.get_analysis_definition()


def test_multiplexer_with_custom_filter():
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer="keyword",
        filter=[
            analysis.token_filter(
                "my_multi",
                "multiplexer",
                filters=[
                    [analysis.token_filter("en", "snowball", language="English")],
                    "lowercase, stop",
                ],
            )
        ],
    )
    assert {
        "analyzer": {
            "my_analyzer": {
                "filter": ["my_multi"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"type": "snowball", "language": "English"},
            "my_multi": {"filters": ["en", "lowercase, stop"], "type": "multiplexer"},
        },
    } == a.get_analysis_definition()


def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )
    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()


def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()


def test_normalizer_serializes_as_name():
    n = analysis.normalizer("my_normalizer")
    assert "my_normalizer" == n.to_dict()


def test_normalizer_has_definition():
    n = analysis.CustomNormalizer(
        "my_normalizer", filter=["lowercase", "asciifolding"], char_filter=["quote"]
    )
    assert {
        "type": "custom",
        "filter": ["lowercase", "asciifolding"],
        "char_filter": ["quote"],
    } == n.get_definition()


def test_tokenizer():
    t = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)

    assert t.to_dict() == "trigram"
    assert {"type": "nGram", "min_gram": 3, "max_gram": 3} == t.get_definition()


def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)
    my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"])
    umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"])
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer=trigram,
        filter=["lowercase", my_stop],
        char_filter=["html_strip", umlauts],
    )

    assert a.to_dict() == "my_analyzer"
    assert {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "trigram",
                "filter": ["lowercase", "my_stop"],
                "char_filter": ["html_strip", "umlauts"],
            }
        },
        "tokenizer": {"trigram": trigram.get_definition()},
        "filter": {"my_stop": my_stop.get_definition()},
        "char_filter": {"umlauts": umlauts.get_definition()},
    } == a.get_analysis_definition()


def test_stemmer_analyzer_can_pass_name():
    t = analysis.token_filter(
        "my_english_filter", name="minimal_english", type="stemmer"
    )

    assert t.to_dict() == "my_english_filter"
    assert {"type": "stemmer", "name": "minimal_english"} == t.get_definition()
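

# Illustrative sketch, not part of the original suite: following the same
# shortcut convention exercised above for analyzer(), normalizer(), and
# tokenizer(), a char_filter referenced only by its built-in name is assumed
# to serialize as that bare name rather than as a definition dict.
def test_char_filter_serializes_as_name():
    cf = analysis.char_filter("html_strip")
    assert "html_strip" == cf.to_dict()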