import argparse
import re
from typing import Iterator, Optional, Sequence, Tuple

# Non-greedy match of anything between '<' and '>' on a single line.
_HTML_TAG_RE = re.compile(r'<.*?>')

# Prefixes of the conversation lines that are kept by
# filter_makenosence_lines; every other line is dropped.
_KEPT_LINE_PREFIXES = ('@Jarvis', '亲爱的玩家')


def remove_html_tag(html_string):
    """Return *html_string* with every ``<...>`` tag removed.

    The pattern does not match across newlines, so a tag that is split over
    several lines is left untouched.

    :param html_string: text that may contain HTML tags
    :return: the text with the tags stripped
    """
    return _HTML_TAG_RE.sub('', html_string)


def convert_html2FAQ(html_string) -> Iterator[Tuple[str, str]]:
    """Yield ``(question, answer)`` pairs parsed from a FAQ dump.

    Each non-blank line is expected to look like ``question<TAB>answer``,
    where the answer may contain HTML markup. Leading/trailing whitespace is
    stripped from every line, HTML tags are removed from the answer, and the
    two parts are prefixed with ``"Question: "`` and ``"Answer: "``.

    Lines without a tab separator (no answer part) are skipped instead of
    raising ``IndexError``. Anything after a second tab is ignored.

    Input example (one line, tab-separated)::

        What conditions must be met to use holographic transmog?<TAB>Hello,
        holographic transmog requires the following two conditions: ...

    :param html_string: the whole FAQ dump as a single string
    :return: a generator of ``(question, answer)`` string tuples
    """
    for raw_line in html_string.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # maxsplit=2 keeps the original semantics: question, answer, and an
        # optional third column that is discarded.
        parts = line.split('\t', 2)
        if len(parts) < 2:
            # Malformed line (no tab / empty answer): nothing to emit.
            continue
        question, answer = parts[0], remove_html_tag(parts[1])
        yield f"Question: {question}", f"Answer: {answer}"


def filter_makenosence_lines(conversion_content: str) -> str:
    """Keep only the meaningful lines of a conversation log.

    A line is kept when it starts with ``@Jarvis`` or with the greeting
    ``亲爱的玩家``; all other lines are discarded.

    :param conversion_content: the raw conversation text
    :return: the kept lines joined with ``'\\n'`` (no trailing newline)
    """
    return '\n'.join(
        line
        for line in conversion_content.splitlines()
        if line.startswith(_KEPT_LINE_PREFIXES)
    )


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Command-line entry point; ``--command`` selects the processing step.

    The input file is read completely *before* the output file is opened, so
    a missing or unreadable input no longer truncates an existing output.

    :param argv: argument list to parse; ``None`` means ``sys.argv[1:]``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, default='./JY_FAQ_Origin.txt', help='input file')
    parser.add_argument('--output_file', type=str, default='./JY_FAQ.txt', help='output file')
    parser.add_argument(
        '--command',
        type=str,
        default='process_faq',
        # Reject unknown commands instead of silently doing nothing.
        choices=['process_faq', 'process_queries'],
        help='use process_faq or process_queries',
    )
    args = parser.parse_args(argv)

    with open(args.input_file, 'r', encoding='utf-8') as input_f:
        content = input_f.read()

    if args.command == 'process_faq':
        output = ''.join(
            f"\n{question}\n{answer}\n"
            for question, answer in convert_html2FAQ(content)
        )
    else:  # 'process_queries' (guaranteed by argparse choices)
        output = filter_makenosence_lines(content)

    with open(args.output_file, 'w', encoding='utf-8') as out_f:
        out_f.write(output)


# Main entry: use the --command flag to execute the different functions.
if __name__ == '__main__':
    main()