{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: cocoNLP in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (0.0.13)\n", "Requirement already satisfied: pyhanlp in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cocoNLP) (0.1.62)\n", "Requirement already satisfied: jieba in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cocoNLP) (0.42.1)\n", "Requirement already satisfied: arrow in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cocoNLP) (0.15.5)\n", "Requirement already satisfied: regex in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cocoNLP) (2020.2.20)\n", "Requirement already satisfied: phonenumbers in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cocoNLP) (8.11.4)\n", "Requirement already satisfied: phone in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from cocoNLP) (0.4.2)\n", "Requirement already satisfied: jpype1==0.7.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from pyhanlp->cocoNLP) (0.7.0)\n", "Requirement already satisfied: python-dateutil in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from arrow->cocoNLP) (2.7.3)\n", "Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from python-dateutil->arrow->cocoNLP) (1.11.0)\n", "\u001b[33mYou are using pip version 10.0.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Requirement already satisfied: python-stdnum in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (1.13)\n", "\u001b[33mYou are using pip version 10.0.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade 
pip' command.\u001b[0m\n", "Requirement already satisfied: textblob in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (0.15.3)\n", "Requirement already satisfied: nltk>=3.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from textblob) (3.3)\n", "Requirement already satisfied: six in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from nltk>=3.1->textblob) (1.11.0)\n", "\u001b[33mYou are using pip version 10.0.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install cocoNLP\n", "!pip install python-stdnum\n", "!pip install textblob" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Using it for full-text processing\n", " The `clean()` function masks the sensitive data it detects in a piece of text.\n", " It takes two parameters:\n", " * text: the text to process\n", " * replace_with: the replacement style; options include `identifier` and `compliance`\n", "\n", "By default, it runs the `card`, `cnaddress`, `cnid`, `cnname`, `credential`, `email`, `name`, `phone`, and `url` detectors. Use `add_detector()` or `remove_detector()` to add or remove a detector." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import desensitize" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "card\n", "cnaddress\n", "cnid\n", "cnname\n", "credential\n", "email\n", "name\n", "phone\n", "url\n" ] } ], "source": [ "de=desensitize.Desensitize()\n", "#de.remove_detector('email')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "text=u\"13725557496 contact Joe Duffy at joe@example.com 370304197709200630 4401250222189922 李伟住在深圳市南山区滨海之窗\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Building prefix dict from the default dictionary ...\n", "Loading model from cache /tmp/jieba.cache\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[('深圳市南山区滨海', 89, 97)]\n", "mobile para: 89 97 深圳市南山区滨海\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loading model cost 0.728 seconds.\n", "Prefix dict has been built successfully.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "李伟 nr 85 87\n", "chinese name para: 85 87 李伟\n", "mobile para: 0 11 13725557496\n" ] }, { "data": { "text/plain": [ "'{{PHONE}} contact {{NAME}} {{NAME}} at {{EMAIL}} {{CNID}} {{CARD}} {{CNNAME}}住在{{CNADDRESS}}之窗'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "de.clean(text)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Building prefix dict from the default dictionary ...\n", "Loading model from cache /tmp/jieba.cache\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[('深圳市南山区滨海', 89, 97)]\n", "mobile para: 89 97 深圳市南山区滨海\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loading model cost 0.724 
seconds.\n", "Prefix dict has been built successfully.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "李伟 nr 85 87\n", "chinese name para: 85 87 李伟\n", "mobile para: 0 11 13725557496\n", "['', 'a16a00b1f7d5e4e1b20a5b7517e17463', ' contact ', 'Joe', ' ', 'Duffy', ' at ', 'joe@example.com', ' ', '370***********0630', ' ', '************9922', ' ', '李*', '住在', '深圳市南山区**', '之窗']\n" ] }, { "data": { "text/plain": [ "'a16a00b1f7d5e4e1b20a5b7517e17463 contact Joe Duffy at joe@example.com 370***********0630 ************9922 李*住在深圳市南山区**之窗'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "de.clean(text, replace_with='compliance')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "de.clean(text, replace_with='identifier')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('深圳市南山区滨海', 89, 97)]\n", "mobile para: 89 97 深圳市南山区滨海\n", "李伟 nr 85 87\n", "chinese name para: 85 87 李伟\n", "mobile para: 0 11 13725557496\n", "['', '{{PHONE-0}}', ' contact ', '{{NAME-1}}', ' ', '{{NAME-2}}', ' at joe@example.com ', '{{CNID-3}}', ' ', '{{CARD-4}}', ' ', '{{CNNAME-5}}', '住在', '{{CNADDRESS-6}}', '之窗']\n" ] }, { "data": { "text/plain": [ "'{{PHONE-0}} contact {{NAME-1}} {{NAME-2}} at joe@example.com {{CNID-3}} {{CARD-4}} {{CNNAME-5}}住在{{CNADDRESS-6}}之窗'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "de.remove_detector('email')\n", "de.clean(text, replace_with='identifier')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2.Using it in csv process" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df=pd.read_csv(\"test.csv\")\n", "df = df.applymap(str)" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Mask every cell of the frame using clean()'s default placeholder style.\n", "df= df.applymap(de.clean)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pandas has no module-level save_csv; DataFrame.to_csv writes the frame to disk.\n", "# index=False keeps the auto-generated row index out of the file so its columns match the input CSV.\n", "df.to_csv(\"test1.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 4 }