import re
import unicodedata

# Lookup tables mapping a state / region / province name to its abbreviation.
# Keys are the upper-cased name with every character outside A-Z removed
# (spaces, hyphens, apostrophes and accents), which is exactly the form
# ``StateNormalizer.normalize`` produces before looking a record up.

# United States: state name -> two-letter abbreviation.
USStateAbbreviation = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEWHAMPSHIRE": "NH",
    "NEWJERSEY": "NJ",
    "NEWMEXICO": "NM",
    "NEWYORK": "NY",
    "NORTHCAROLINA": "NC",
    "NORTHDAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "PENNSYLVANIA": "PA",
    "RHODEISLAND": "RI",
    "SOUTHCAROLINA": "SC",
    "SOUTHDAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WESTVIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY",
}

# France: region name -> two-letter code.
FRStateAbbreviation = {
    "ALSACE": "AA",
    "AQUITAINE": "AQ",
    "AUVERGNE": "AU",
    "BRITTANY": "BT",
    "BURGUNDY": "BG",
    "CENTRE": "CN",
    "CHAMPAGNEARDENNE": "CG",
    "FRANCHECOMTE": "FC",
    "ILEDEFRANCE": "IF",
    "LANGUEDOCROUSSILLON": "LU",
    "LIMOUSIN": "LM",
    "LORRAINE": "LE",
    "LOWERNORMANDY": "BN",
    "MIDIPYRENEES": "MP",
    "NORDPASDECALAIS": "NP",
    "PAYSDELALOIRE": "PL",
    "PICARDY": "PI",
    "POITOUCHARENTES": "PT",
    "PROVENCEALPESCOTEDAZUR": "PR",
    "RHONEALPES": "RA",
    "UPPERNORMANDY": "HT",
    "CORSICA": "CE",
}

# Canada: province / territory name -> two-letter abbreviation.
CAStateAbbreviation = {
    "ALBERTA": "AB",
    "BRITISHCOLUMBIA": "BC",
    "MANITOBA": "MB",
    "NEWBRUNSWICK": "NB",
    "NEWFOUNDLANDANDLABRADOR": "NL",
    "NORTHWESTTERRITORIES": "NT",
    "NOVASCOTIA": "NS",
    "NUNAVUT": "NU",
    "ONTARIO": "ON",
    "PRINCEEDWARDISLAND": "PE",
    "QUEBEC": "QC",
    "SASKATCHEWAN": "SK",
    "YUKON": "YT",
}

# Germany: state name (English spelling) -> two-letter code.
DEStateAbbreviation = {
    "BADENWUERTTEMBERG": "BW",
    "BAVARIA": "BY",
    "BERLIN": "BE",
    "BRANDENBURG": "BB",
    "BREMEN": "HB",
    "HAMBURG": "HH",
    "HESSE": "HE",
    "LOWERSAXONY": "NI",
    "MECKLENBURGVORPOMMERN": "MV",
    "NORTHRHINEWESTPHALIA": "NW",
    "RHINELANDPALATINATE": "RP",
    "SAARLAND": "SL",
    "SAXONY": "SN",
    "SAXONYANHALT": "ST",
    "SCHLESWIGHOLSTEIN": "SH",
    "THURINGIA": "TH",
}

# Italy: province name (English spelling) -> two-letter code.
ITStateAbbreviation = {
    "AGRIGENTO": "AG",
    "ALESSANDRIA": "AL",
    "ANCONA": "AN",
    "AOSTA": "AO",
    "AREZZO": "AR",
    "ASCOLIPICENO": "AP",
    "ASTI": "AT",
    "AVELLINO": "AV",
    "BARI": "BA",
    "BARLETTAANDRIATRANI": "BT",
    "BELLUNO": "BL",
    "BENEVENTO": "BN",
    "BERGAMO": "BG",
    "BIELLA": "BI",
    "BOLOGNA": "BO",
    "SOUTHTYROL": "BZ",
    "BRESCIA": "BS",
    "BRINDISI": "BR",
    "CAGLIARI": "CA",
    "CALTANISSETTA": "CL",
    "CAMPOBASSO": "CB",
    "CARBONIAIGLESIAS": "CI",
    "CASERTA": "CE",
    "CATANIA": "CT",
    "CATANZARO": "CZ",
    "CHIETI": "CH",
    "COMO": "CO",
    "COSENZA": "CS",
    "CREMONA": "CR",
    "CROTONE": "KR",
    "CUNEO": "CN",
    "ENNA": "EN",
    "FERMO": "FM",
    "FERRARA": "FE",
    "FLORENCE": "FI",
    "FOGGIA": "FG",
    "FORLICESENA": "FC",
    "FROSINONE": "FR",
    "GENOA": "GE",
    "GORIZIA": "GO",
    "GROSSETO": "GR",
    "IMPERIA": "IM",
    "ISERNIA": "IS",
    "LASPEZIA": "SP",
    "LAQUILA": "AQ",
    "LATINA": "LT",
    "LECCE": "LE",
    "LECCO": "LC",
    "LIVORNO": "LI",
    "LODI": "LO",
    "LUCCA": "LU",
    "MACERATA": "MC",
    "MANTUA": "MN",
    "MASSAANDCARRARA": "MS",
    "MATERA": "MT",
    "MEDIOCAMPIDANO": "VS",
    "MESSINA": "ME",
    "MILAN": "MI",
    "MODENA": "MO",
    "MONZAANDBRIANZA": "MB",
    "NAPLES": "NA",
    "NOVARA": "NO",
    "NUORO": "NU",
    "OGLIASTRA": "OG",
    "OLBIATEMPIO": "OT",
    "ORISTANO": "OR",
    "PADUA": "PD",
    "PALERMO": "PA",
    "PARMA": "PR",
    "PAVIA": "PV",
    "PERUGIA": "PG",
    "PESAROANDURBINO": "PU",
    "PESCARA": "PE",
    "PIACENZA": "PC",
    "PISA": "PI",
    "PISTOIA": "PT",
    "PORDENONE": "PN",
    "POTENZA": "PZ",
    "PRATO": "PO",
    "RAGUSA": "RG",
    "RAVENNA": "RA",
    "REGGIOCALABRIA": "RC",
    "REGGIOEMILIA": "RE",
    "RIETI": "RI",
    "RIMINI": "RN",
    "ROME": "RM",
    "ROVIGO": "RO",
    "SALERNO": "SA",
    "SASSARI": "SS",
    "SAVONA": "SV",
    "SIENA": "SI",
    "SONDRIO": "SO",
    "SYRACUSE": "SR",
    "TARANTO": "TA",
    "TERAMO": "TE",
    "TERNI": "TR",
    "TRAPANI": "TP",
    "TRENTINO": "TN",
    "TREVISO": "TV",
    "TRIESTE": "TS",
    "TURIN": "TO",
    "UDINE": "UD",
    "VARESE": "VA",
    "VENICE": "VE",
    "VERBANOCUSIOOSSOLA": "VB",
    "VERCELLI": "VC",
    "VERONA": "VR",
    "VIBOVALENTIA": "VV",
    "VICENZA": "VI",
    "VITERBO": "VT",
}

# Spain: province name -> one- or two-letter code.  Several provinces are
# listed under more than one spelling (e.g. ALICANTE / ALACANT).
ESStateAbbreviation = {
    "ALICANTE": "A",
    "ALACANT": "A",
    "ALBACETE": "AB",
    "ALMERIA": "AL",
    "AVILA": "AV",
    "BARCELONA": "B",
    "BADAJOZ": "BA",
    "VIZCAYA": "BI",
    "BIZKAIA": "BI",
    "BURGOS": "BU",
    "LACORUNA": "C",
    "ACORUNA": "C",
    "CADIZ": "CA",
    "CACERES": "CC",
    "CEUTA": "CE",
    "CORDOBA": "CO",
    "CIUDADREAL": "CR",
    "CASTELLON": "CS",
    "CASTELLO": "CS",
    "CUENCA": "CU",
    "LASPALMAS": "GC",
    "GIRONA": "GI",
    "GERONA": "GI",
    "GRANADA": "GR",
    "GUADALAJARA": "GU",
    "HUELVA": "H",
    "HUESCA": "HU",
    "JAEN": "J",
    "LERIDA": "L",
    "LLEIDA": "L",
    "LEON": "LE",
    "LARIOJA": "LO",
    "LUGO": "LU",
    "MADRID": "M",
    "MALAGA": "MA",
    "MELILLA": "ML",
    "MURCIA": "MU",
    "NAVARRA": "NA",
    "NAFARROA": "NA",
    "ASTURIAS": "O",
    "ORENSE": "OR",
    "OURENSE": "OR",
    "PALENCIA": "P",
    "BALEARES": "PM",
    "BALEARS": "PM",
    "PONTEVEDRA": "PO",
    "CANTABRIA": "S",
    "SALAMANCA": "SA",
    "SEVILLA": "SE",
    "SEGOVIA": "SG",
    "SORIA": "SO",
    "GUIPUZCOA": "SS",
    "GIPUZKOA": "SS",
    "TARRAGONA": "T",
    "TERUEL": "TE",
    "SANTACRUZDETENERIFE": "TF",
    "TOLEDO": "TO",
    "VALENCIA": "V",
    "VALLADOLID": "VA",
    "ALAVA": "VI",
    "ARABA": "VI",
    "ZARAGOZA": "Z",
    "ZAMORA": "ZA",
}

# Country code -> lookup table used by StateNormalizer.  Matching is exact
# (upper-case codes only).
_STATE_MAPS_BY_COUNTRY = {
    "US": USStateAbbreviation,
    "FR": FRStateAbbreviation,
    "CA": CAStateAbbreviation,
    "DE": DEStateAbbreviation,
    "ES": ESStateAbbreviation,
    "IT": ITStateAbbreviation,
}

# Matches every character that is not an upper-case ASCII letter.
_NON_LETTERS = re.compile(r"[^A-Z]")


class StateNormalizer:
    """Normalize a free-text state/region/province name to a short code.

    Attributes:
        state_abbreviation_map: The name -> abbreviation table selected for
            the country given to the constructor; empty for countries that
            have no table.
        normalized_record: Result of the most recent ``normalize`` call.
            The attribute does not exist until ``normalize`` has been called.
    """

    def __init__(self, country_code: str) -> None:
        """Select the abbreviation table for ``country_code``.

        Args:
            country_code: One of "US", "FR", "CA", "DE", "ES" or "IT"
                (matched exactly).  Any other value selects an empty table,
                so ``normalize`` falls back to plain truncation.
        """
        table = None
        # Guard against unhashable values: like the plain equality checks
        # this replaces, anything that is not a known code is just "unknown".
        if isinstance(country_code, str):
            table = _STATE_MAPS_BY_COUNTRY.get(country_code)
        # Countries without specific state abbreviations get a fresh empty map.
        self.state_abbreviation_map = {} if table is None else table

    def normalize(self, record: str) -> "StateNormalizer":
        """Normalize ``record`` and store the result in ``normalized_record``.

        The record is upper-cased, accented letters are folded to their base
        letter, and every remaining character outside A-Z is dropped.  If the
        resulting key is in the country's table the mapped abbreviation is
        used; otherwise the key is truncated to its first two letters.  The
        stored result is lower-cased.

        Args:
            record: The raw state/region/province text.

        Returns:
            This instance, so the result can be read via
            ``normalizer.normalize(x).normalized_record``.
        """
        # NFKD splits an accented letter into base letter + combining mark
        # ("Ó" -> "O" + U+0301).  The A-Z filter then removes only the mark,
        # so "Córdoba" yields the key "CORDOBA" rather than "CRDOBA".
        decomposed = unicodedata.normalize("NFKD", record.upper())
        key = _NON_LETTERS.sub("", decomposed)
        # Slicing is a no-op for keys of two letters or fewer, so unknown
        # names can be truncated unconditionally.
        code = self.state_abbreviation_map.get(key, key[:2])
        self.normalized_record = code.lower()
        return self