# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

import re

# Matches opening/closing tags whose name *starts with* img, a, script or link
# (case-insensitive, optional whitespace after '<'). There is no word boundary
# after the name, so tags such as <audio>, <area> or <abbr> match as well;
# this over-matching is kept on purpose because narrowing it would let more
# markup through the filter.
_invalid_pattern = re.compile(
    r'<[\s]*(img|a|script|link|/img|/a|/script|/link)[^>]*>',
    re.IGNORECASE,
)

# Matches anything that looks like a tag: '<', any run of non-'>' characters
# (including newlines), then '>'.
_all_tags_regex = re.compile('<[^>]*>')


def sanitize(text, new_value=''):
    """
    Replace HTML tags that can be executed, or used to download from or link
    to external resources, with ``new_value``.

    Every ``<...>`` span in ``text`` is inspected; spans that match
    ``_invalid_pattern`` are replaced by ``new_value`` and all other text is
    left untouched.

    :param text: value to sanitize. Anything that is not a ``str`` is returned
        as-is; the type is deliberately not enforced so that callers can pass
        arbitrary values without this function failing.
    :param new_value: replacement inserted in place of each removed tag
        (defaults to the empty string). It is inserted literally.
    :return: the sanitized string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    def _replace_if_invalid(match):
        tag = match.group(0)
        return new_value if _invalid_pattern.search(tag) else tag

    # Substitute in a single pass, by position. Removing tags one after the
    # other with str.replace() on an already-modified copy could change the
    # text of a later match (e.g. '<a><script <a>src=x>') so that it was no
    # longer found, leaving a '<script ...>' tag in the output. A callable is
    # used as the replacement so that backslashes in new_value are not
    # interpreted as regex group references.
    return _all_tags_regex.sub(_replace_if_invalid, text)