# SPDX-License-Identifier: Apache-2.0 # # The OpenSearch Contributors require contributions made to # this file be licensed under the Apache-2.0 license or a # compatible open source license. # Modifications Copyright OpenSearch Contributors. See # GitHub history for details. # Licensed to Elasticsearch B.V. under one or more contributor # license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright # ownership. Elasticsearch B.V. licenses this file to you under # the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
"""Micro-benchmarks comparing ways to pull ``pit_id`` and the last ``sort``
value out of a serialized search response.

Each ``*_parsing_candidate_*`` function is one extraction strategy; the
``test_*`` functions feed those candidates to the ``benchmark`` fixture using
the small and large pages defined on ``ParsingBenchmarks``.
"""
import io
import json
import re
from unittest import TestCase

import pytest
import ujson

from osbenchmark.worker_coordinator import runner


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_reverse_and_regexp_small(benchmark):
    """Benchmark the reverse-and-regexp sort candidate on the small page."""
    benchmark(sort_parsing_candidate_reverse_and_regexp, ParsingBenchmarks.small_page)


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_reverse_and_regexp_large(benchmark):
    """Benchmark the reverse-and-regexp sort candidate on the large page."""
    benchmark(sort_parsing_candidate_reverse_and_regexp, ParsingBenchmarks.large_page)


def sort_parsing_candidate_reverse_and_regexp(response):
    """Return the last ``sort`` array by searching the reversed response.

    The response string is reversed so that the first regexp match is the
    last ``"sort"`` entry of the original; the captured text is reversed
    back before being decoded.

    :param response: the serialized response as a ``str``.
    :return: the decoded ``sort`` value of the last hit.
    :raises AttributeError: if the response contains no match (``re.search``
        returned ``None``).
    """
    reversed_response = response[::-1]
    # This is the pattern for '"sort":[...]' written back to front.
    sort_pattern = r"(\][^\]]*?\[):\"tros\""
    x = re.search(sort_pattern, reversed_response)
    # Recorded timings: json.loads mean 3.6 ms, ujson.loads mean 1.7 ms.
    return ujson.loads(x.group(1)[::-1])


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_rfind_and_regexp_small(benchmark):
    """Benchmark the rfind-and-regexp sort candidate on the small page."""
    benchmark(sort_parsing_candidate_rfind_and_regexp, ParsingBenchmarks.small_page)


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_rfind_and_regexp_large(benchmark):
    """Benchmark the rfind-and-regexp sort candidate on the large page."""
    benchmark(sort_parsing_candidate_rfind_and_regexp, ParsingBenchmarks.large_page)


def sort_parsing_candidate_rfind_and_regexp(response):
    """Return the last ``sort`` array using ``str.rfind`` plus a regexp.

    ``rfind`` locates the last ``"sort"`` key, and the regexp is then applied
    only to the tail of the response starting at that position.

    :param response: the serialized response as a ``str``.
    :return: the decoded ``sort`` value of the last hit.
    :raises AttributeError: if the regexp does not match the tail.
    """
    index_of_last_sort = response.rfind('"sort"')
    sort_pattern = r"sort\":([^\]]*])"
    x = re.search(sort_pattern, response[index_of_last_sort::])
    return ujson.loads(x.group(1))


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_end_anchor_regexp(benchmark):
    """Benchmark the end-anchored regexp sort candidate on the small page."""
    benchmark(sort_parsing_candidate_end_anchor_regexp, ParsingBenchmarks.small_page)


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_end_anchor_regexp_large(benchmark):
    """Benchmark the end-anchored regexp sort candidate on the large page."""
    benchmark(sort_parsing_candidate_end_anchor_regexp, ParsingBenchmarks.large_page)


def sort_parsing_candidate_end_anchor_regexp(response):
    """Return the last ``sort`` array using a regexp anchored at the end.

    The pattern requires the ``sort`` array to be followed directly by the
    closing ``}]}}`` at the very end of the string, so it only matches a
    response with no trailing whitespace after the final brace.

    :param response: the serialized response as a ``str``.
    :return: the decoded ``sort`` value of the last hit.
    :raises AttributeError: if the response does not end as expected.
    """
    # predictably, no difference in using a literal lookahead vs just a
    # surrounding pattern. room for improvement?
    sort_pattern = r"\"sort\":([^\]]*])\}\]\}\}$"
    x = re.search(sort_pattern, response)
    # Recorded timings: ast.literal_eval mean 8.6 ms, json.loads mean 3.2 ms,
    # ujson.loads mean 1.5 ms.
    return ujson.loads(x.group(1))


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_find_all_regexp_small(benchmark):
    """Benchmark the findall-based sort candidate on the small page."""
    benchmark(sort_parsing_candidate_find_all, ParsingBenchmarks.small_page)


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_sort_find_all_regexp_large(benchmark):
    """Benchmark the findall-based sort candidate on the large page."""
    benchmark(sort_parsing_candidate_find_all, ParsingBenchmarks.large_page)


def sort_parsing_candidate_find_all(response):
    """Return the last ``sort`` array by collecting every match and taking the last.

    :param response: the serialized response as a ``str``.
    :return: the decoded ``sort`` value of the last hit.
    :raises IndexError: if the response contains no ``"sort"`` entry.
    """
    sort_pattern = r"\"sort\":([^\]]+])"
    x = re.findall(sort_pattern, response)
    return ujson.loads(x[-1])


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_pit_id_regexp_small(benchmark):
    """Benchmark the regexp pit_id candidate on the small page."""
    benchmark(pit_id_parsing_candidate_regexp, ParsingBenchmarks.small_page)


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_pit_id_regexp_large(benchmark):
    """Benchmark the regexp pit_id candidate on the large page."""
    benchmark(pit_id_parsing_candidate_regexp, ParsingBenchmarks.large_page)


def pit_id_parsing_candidate_regexp(response):
    """Return the ``pit_id`` string extracted with a regexp.

    :param response: the serialized response as a ``str``.
    :return: the text between the quotes following ``"pit_id":``.
    :raises AttributeError: if the response contains no ``pit_id``.
    """
    # Recorded timing: 0.9 ms.
    pit_id_pattern = r'"pit_id":"([^"]*)"'
    x = re.search(pit_id_pattern, response)
    return x.group(1)


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_combined_json_small(benchmark):
    """Benchmark the stdlib-json combined candidate on the small page."""
    benchmark(combined_parsing_candidate_json_loads, ParsingBenchmarks.small_page)


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_combined_json_large(benchmark):
    """Benchmark the stdlib-json combined candidate on the large page."""
    benchmark(combined_parsing_candidate_json_loads, ParsingBenchmarks.large_page)


def combined_parsing_candidate_json_loads(response):
    """Decode the whole response with ``json.loads`` and read both values.

    :param response: the serialized response as a ``str``.
    :return: a ``(pit_id, sort)`` tuple, where ``sort`` comes from the last
        entry of ``hits.hits``.
    """
    parsed_response = json.loads(response)
    pit_id = parsed_response.get("pit_id")
    sort = parsed_response.get("hits").get("hits")[-1].get("sort")
    return pit_id, sort


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_combined_ijson_large(benchmark):
    """Benchmark the ujson-backed combined candidate on the large page."""
    # Previously this measured combined_parsing_candidate_json_loads, so the
    # result was a duplicate of test_combined_json_large.
    benchmark(combined_parsing_candidate_ijson_loads, ParsingBenchmarks.large_page)


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_combined_ijson_small(benchmark):
    """Benchmark the ujson-backed combined candidate on the small page."""
    # Previously this measured combined_parsing_candidate_json_loads, so the
    # result was a duplicate of test_combined_json_small.
    benchmark(combined_parsing_candidate_ijson_loads, ParsingBenchmarks.small_page)


def combined_parsing_candidate_ijson_loads(response):
    """Decode the whole response with ``ujson.loads`` and read both values.

    NOTE: despite the ``ijson`` in its name, this candidate calls ``ujson``.

    :param response: the serialized response as a ``str``.
    :return: a ``(pit_id, sort)`` tuple, where ``sort`` comes from the last
        entry of ``hits.hits``.
    """
    parsed_response = ujson.loads(response)
    pit_id = parsed_response.get("pit_id")
    sort = parsed_response.get("hits").get("hits")[-1].get("sort")
    return pit_id, sort


@pytest.mark.benchmark(
    group="parse",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_pit_id_parse_small(benchmark):
    """Benchmark the ``runner.parse`` pit_id candidate on the small page."""
    # Encode once up front so the encoding cost is not part of the measurement.
    page = ParsingBenchmarks.small_page.encode()
    benchmark(pit_id_parsing_candidate_runner_parse, page)


@pytest.mark.benchmark(
    group="parse_large",
    warmup="on",
    warmup_iterations=10000,
    disable_gc=True
)
def test_pit_id_parse_large(benchmark):
    """Benchmark the ``runner.parse`` pit_id candidate on the large page."""
    # Encode once up front so the encoding cost is not part of the measurement.
    page = ParsingBenchmarks.large_page.encode()
    benchmark(pit_id_parsing_candidate_runner_parse, page)


def pit_id_parsing_candidate_runner_parse(response):
    """Return the ``pit_id`` obtained through ``runner.parse``.

    :param response: the serialized response as ``bytes``; it is wrapped in
        an ``io.BytesIO`` before being handed to ``runner.parse``.
    :return: the ``"pit_id"`` entry of the mapping returned by
        ``runner.parse``.
    """
    response_bytes = io.BytesIO(response)
    parsed = runner.parse(response_bytes, ["pit_id"])
    pit_id = parsed["pit_id"]
    return pit_id


class ParsingBenchmarks(TestCase):
    """Shared response fixtures plus a sanity check of every candidate."""

    def test_all_candidates(self):
        """
        Quick utility test to ensure all benchmark cases are correct
        """
        pit_id = pit_id_parsing_candidate_runner_parse(self.small_page.encode())
        self.assertEqual("fedcba9876543210", pit_id)

        sort = sort_parsing_candidate_reverse_and_regexp(self.small_page)
        self.assertEqual([1609780186, "2"], sort)

        sort = sort_parsing_candidate_rfind_and_regexp(self.large_page)
        self.assertEqual([1609780186, "2"], sort)

        sort = sort_parsing_candidate_end_anchor_regexp(self.small_page)
        self.assertEqual([1609780186, "2"], sort)

        sort = sort_parsing_candidate_find_all(self.large_page)
        self.assertEqual([1609780186, "2"], sort)

        pit_id = pit_id_parsing_candidate_regexp(self.large_page)
        self.assertEqual("fedcba9876543210", pit_id)

        pit_id, sort = combined_parsing_candidate_json_loads(self.small_page)
        self.assertEqual([1609780186, "2"], sort)
        self.assertEqual("fedcba9876543210", pit_id)

        # The ujson-backed combined candidate was previously left unchecked.
        pit_id, sort = combined_parsing_candidate_ijson_loads(self.small_page)
        self.assertEqual([1609780186, "2"], sort)
        self.assertEqual("fedcba9876543210", pit_id)

    # Two hits; all newlines and spaces are stripped so the page looks like a
    # compact (non-pretty-printed) response.
    small_page = """
    {
        "pit_id": "fedcba9876543210",
        "took": 10,
        "timed_out": false,
        "hits": {
            "total": 2,
            "hits": [
                {
                    "_id": "1",
                    "timestamp": 1609780186,
                    "sort": [1609780186, "1"]
                },
                {
                    "_id": "2",
                    "timestamp": 1609780186,
                    "sort": [1609780186, "2"]
                }
            ]
        }
    }
    """.replace("\n", "").replace(" ", "")  # assume client never calls ?pretty :)

    # Same envelope as small_page, with the first hit repeated 100 times
    # ahead of the final hit.
    large_page = ("""
    {
        "pit_id": "fedcba9876543210",
        "took": 10,
        "timed_out": false,
        "hits": {
            "total": 2,
            "hits": [""" + """
                {
                    "_id": "1",
                    "timestamp": 1609780186,
                    "sort": [1609780186, "1"]
                },""" * 100 + """
                {
                    "_id": "2",
                    "timestamp": 1609780186,
                    "sort": [1609780186, "2"]
                }
            ]
        }
    }
    """).replace("\n", "").replace(" ", "")