# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# File called _pytest for PyCharm compatibility

import pandas as pd
import pytest

from opensearch_py_ml.dataframe import DEFAULT_NUM_ROWS_DISPLAYED
from tests.common import TestData, assert_pandas_opensearch_py_ml_series_equal


class TestDataFrameRepr(TestData):
    """Compare opensearch_py_ml DataFrame renderings against pandas.

    Each test renders the same data through pandas and opensearch_py_ml
    (``to_string``, ``repr``, ``to_html``, ``_repr_html_``) and asserts that
    the two outputs are identical.
    """

    @classmethod
    def setup_class(cls):
        """Pin ``display.max_rows`` to 60 for every test in this class."""
        # conftest.py changes this default - restore to original setting
        # NOTE(review): the value in effect before this call is not restored
        # when the class finishes - confirm no later test depends on it.
        pd.set_option("display.max_rows", 60)

    #
    # to_string
    #
    def test_simple_lat_lon(self):
        """Check that a nested lat/lon object column matches pandas.

        Note on nested object order - the key order of a nested object can
        change depending on how ``_source`` is requested (this could be a
        bug in ES):

            PUT my_index/doc/1
            {
              "location": {
                "lat": "50.033333",
                "lon": "8.570556"
              }
            }

            GET my_index/_search

            "_source": {
              "location": {
                "lat": "50.033333",
                "lon": "8.570556"
              }
            }

            GET my_index/_search
            {
              "_source": "location"
            }

            "_source": {
              "location": {
                "lon": "8.570556",
                "lat": "50.033333"
              }
            }

        Hence we store the pandas df source json as 'lon', 'lat'
        """
        pd_dest_location = self.pd_flights()["DestLocation"].head(1)
        oml_dest_location = self.oml_flights()["DestLocation"].head(1)

        # NOTE(review): rtol=2 is presumably forwarded to pandas' series
        # comparison, where it would be a very loose relative tolerance -
        # confirm this is intended.
        assert_pandas_opensearch_py_ml_series_equal(
            pd_dest_location, oml_dest_location, check_exact=False, rtol=2
        )

    def test_num_rows_to_string(self):
        """Compare ``to_string`` output around the default row limit."""
        # check setup works
        assert pd.get_option("display.max_rows") == 60

        # Test opensearch_py_ml.DataFrame.to_string vs pandas.DataFrame.to_string
        # In pandas calling 'to_string' without max_rows set, will dump ALL rows

        # Test n-1, n, n+1 for edge cases
        self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED - 1)
        self.num_rows_to_string(DEFAULT_NUM_ROWS_DISPLAYED)
        with pytest.warns(UserWarning):
            # UserWarning displayed by opensearch_py_ml here
            # (compare to pandas with max_rows set)
            self.num_rows_to_string(
                DEFAULT_NUM_ROWS_DISPLAYED + 1, None, DEFAULT_NUM_ROWS_DISPLAYED
            )

        # Test for where max_rows lt or gt num_rows
        self.num_rows_to_string(10, 5, 5)
        self.num_rows_to_string(100, 200, 200)

    def num_rows_to_string(self, rows, max_rows_eland=None, max_rows_pandas=None):
        """Assert ``to_string`` parity for the first ``rows`` flights.

        :param rows: number of rows passed to ``head`` on both frames.
        :param max_rows_eland: ``max_rows`` given to the opensearch_py_ml
            ``to_string`` call.
        :param max_rows_pandas: ``max_rows`` given to the pandas
            ``to_string`` call.
        """
        columns = ["DestLocation", "OriginLocation"]
        oml_head = self.oml_flights()[columns].head(rows)
        pd_head = self.pd_flights()[columns].head(rows)

        oml_head_str = oml_head.to_string(max_rows=max_rows_eland)
        pd_head_str = pd_head.to_string(max_rows=max_rows_pandas)

        assert pd_head_str == oml_head_str

    def test_empty_dataframe_string(self):
        """Compare ``to_string`` output of the ecommerce frames filtered to USD.

        Judging by the test name, the filter is expected to match no rows.
        """
        oml_ecom = self.oml_ecommerce()
        pd_ecom = self.pd_ecommerce()

        oml_ecom_s = oml_ecom[oml_ecom["currency"] == "USD"].to_string()
        pd_ecom_s = pd_ecom[pd_ecom["currency"] == "USD"].to_string()

        assert oml_ecom_s == pd_ecom_s

    #
    # repr
    #
    def test_num_rows_repr(self):
        """Compare ``repr`` output around ``display.max_rows``."""
        max_rows = pd.get_option("display.max_rows")
        min_rows = pd.get_option("display.min_rows")

        # Test n-1, n, n+1 for edge cases; above max_rows the expected number
        # of printed rows is display.min_rows
        self.num_rows_repr(max_rows - 1, max_rows - 1)
        self.num_rows_repr(max_rows, max_rows)
        self.num_rows_repr(max_rows + 1, min_rows)

    def num_rows_repr(self, rows, num_rows_printed):
        """Assert ``repr`` parity and line count for the first ``rows`` flights.

        :param rows: number of rows passed to ``head`` on both frames.
        :param num_rows_printed: number of data rows expected in the output.
        """
        oml_head = self.oml_flights().head(rows)
        pd_head = self.pd_flights().head(rows)

        oml_head_str = repr(oml_head)
        pd_head_str = repr(pd_head)

        if num_rows_printed < rows:
            # add 1 for ellipsis
            num_rows_printed += 1

        # number of rows is num_rows_printed + 3 (header, summary)
        assert (num_rows_printed + 3) == len(oml_head_str.splitlines())

        assert pd_head_str == oml_head_str

    def test_empty_dataframe_repr(self):
        """Compare ``repr`` output of the ecommerce frames filtered to USD."""
        oml_ecom = self.oml_ecommerce()
        pd_ecom = self.pd_ecommerce()

        # currently opensearch_py_ml will show dimensions no matter what if
        # pd's display.show_dimensions option is set to 'truncate'; this is a
        # fairly minor issue which is difficult to fix, so we ignore it for now
        # and force dimensions on for both sides.
        # option_context restores the previous value even when the assertion
        # fails, so the override cannot leak into other tests.
        with pd.option_context("display.show_dimensions", True):
            oml_ecom_r = repr(oml_ecom[oml_ecom["currency"] == "USD"])
            pd_ecom_r = repr(pd_ecom[pd_ecom["currency"] == "USD"])

            assert oml_ecom_r == pd_ecom_r

    #
    # to_html
    #
    def test_num_rows_to_html(self):
        """Compare ``to_html`` output around the default row limit."""
        # check setup works
        assert pd.get_option("display.max_rows") == 60

        # Test opensearch_py_ml.DataFrame.to_html vs pandas.DataFrame.to_html

        # Test n-1, n, n+1 for edge cases
        self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED - 1)
        self.num_rows_to_html(DEFAULT_NUM_ROWS_DISPLAYED)
        with pytest.warns(UserWarning):
            # UserWarning displayed by opensearch_py_ml here
            self.num_rows_to_html(
                DEFAULT_NUM_ROWS_DISPLAYED + 1, None, DEFAULT_NUM_ROWS_DISPLAYED
            )

        # Test for where max_rows lt or gt num_rows
        self.num_rows_to_html(10, 5, 5)
        self.num_rows_to_html(100, 200, 200)

    def num_rows_to_html(self, rows, max_rows_eland=None, max_rows_pandas=None):
        """Assert ``to_html`` parity for the first ``rows`` flights.

        :param rows: number of rows passed to ``head`` on both frames.
        :param max_rows_eland: ``max_rows`` given to the opensearch_py_ml
            ``to_html`` call.
        :param max_rows_pandas: ``max_rows`` given to the pandas
            ``to_html`` call.
        """
        oml_head = self.oml_flights().head(rows)
        pd_head = self.pd_flights().head(rows)

        oml_head_str = oml_head.to_html(max_rows=max_rows_eland)
        pd_head_str = pd_head.to_html(max_rows=max_rows_pandas)

        assert pd_head_str == oml_head_str

    def test_empty_dataframe_to_html(self):
        """Compare ``to_html`` output of the ecommerce frames filtered to USD."""
        oml_ecom = self.oml_ecommerce()
        pd_ecom = self.pd_ecommerce()

        oml_ecom_h = oml_ecom[oml_ecom["currency"] == "USD"].to_html()
        pd_ecom_h = pd_ecom[pd_ecom["currency"] == "USD"].to_html()

        assert oml_ecom_h == pd_ecom_h

    #
    # _repr_html_
    #
    def test_num_rows_repr_html(self):
        """Compare ``_repr_html_`` output around ``display.max_rows``."""
        # check setup works
        assert pd.get_option("display.max_rows") == 60

        # TODO - there is a bug in 'show_dimensions' (per the original,
        # truncated note its output "gets added after the last" element).
        # For now test without this; the option is restored on exit.
        with pd.option_context("display.show_dimensions", False):
            max_rows = pd.get_option("display.max_rows")

            # Test opensearch_py_ml.DataFrame._repr_html_ vs
            # pandas.DataFrame._repr_html_

            # Test n-1, n, n+1 for edge cases
            self.num_rows_repr_html(max_rows - 1)
            self.num_rows_repr_html(max_rows)
            self.num_rows_repr_html(max_rows + 1, max_rows)

    def test_num_rows_repr_html_display_none(self):
        """Compare ``_repr_html_`` with ``display.notebook_repr_html`` off."""
        # The option is restored on exit, even if the comparison fails.
        with pd.option_context("display.notebook_repr_html", False):
            self.num_rows_repr_html(pd.get_option("display.max_rows"))

    def num_rows_repr_html(self, rows, max_rows=None):
        """Assert ``_repr_html_`` parity for the first ``rows`` flights.

        :param rows: number of rows passed to ``head`` on both frames.
        :param max_rows: currently unused; kept so existing callers that pass
            it keep working.
        """
        oml_head = self.oml_flights().head(rows)
        pd_head = self.pd_flights().head(rows)

        oml_head_str = oml_head._repr_html_()
        pd_head_str = pd_head._repr_html_()

        assert pd_head_str == oml_head_str

    def test_empty_dataframe_repr_html(self):
        """Compare ``_repr_html_`` of the ecommerce frames filtered to USD."""
        # TODO - there is a bug in 'show_dimensions' (per the original,
        # truncated note its output "gets added after the last" element).
        # For now test without this; the option is restored on exit.
        with pd.option_context("display.show_dimensions", False):
            oml_ecom = self.oml_ecommerce()
            pd_ecom = self.pd_ecommerce()

            oml_ecom_rh = oml_ecom[oml_ecom["currency"] == "USD"]._repr_html_()
            pd_ecom_rh = pd_ecom[pd_ecom["currency"] == "USD"]._repr_html_()

            assert oml_ecom_rh == pd_ecom_rh

    def test_dataframe_repr_pd_get_option_none(self):
        """Compare ``repr`` output when ``display.max_rows`` is ``None``."""
        columns = [
            "AvgTicketPrice",
            "Cancelled",
            "dayOfWeek",
            "timestamp",
            "DestCountry",
        ]

        # All three options are restored on exit, even if the assertion fails.
        with pd.option_context(
            "display.show_dimensions",
            False,
            "display.max_rows",
            None,
            "display.expand_frame_repr",
            False,
        ):
            oml_flights = repr(self.oml_flights().filter(columns).head(40))
            pd_flights = repr(self.pd_flights().filter(columns).head(40))

            assert oml_flights == pd_flights