import xml.sax
import csv
import json
import re

from bs4 import BeautifulSoup

# Not used by the code in this file; kept so any external reference still resolves.
thedatawriter = None

# Compiled once at import time: startElement runs for every <row> of the dump.
_TAG_DELIMITERS = re.compile("[<>]+")
_WHITESPACE_RUN = re.compile(r"\s+")


class PostsHandler(xml.sax.ContentHandler):
    """SAX handler that turns ``<row>`` elements into one JSON object per line.

    Only rows whose ``PostTypeId`` is <= 2 are exported; type 1 is labelled
    ``"question"`` and type 2 ``"answer"``. Every other element or post type
    is ignored.

    Args:
        out: Object with a ``write(str)`` method that receives the JSON
            lines. When omitted, the handler falls back to the module-level
            ``myjsonfile`` that the driver code at the bottom of this file
            binds, which keeps ``PostsHandler()`` working as before.
    """

    def __init__(self, out=None):
        xml.sax.ContentHandler.__init__(self)
        self._out = out

    def startElement(self, name, attrs):
        """Serialize one ``<row>`` element as a compact JSON line.

        Args:
            name: Element name reported by the SAX parser.
            attrs: Attribute mapping of the element.

        Raises:
            KeyError: If a ``<row>`` lacks ``PostTypeId`` or ``Id``, or an
                answer row lacks ``ParentId``.
            ValueError: If ``PostTypeId`` is not an integer literal.
        """
        if name != "row":
            return
        postType = int(attrs["PostTypeId"])
        if postType > 2:
            return

        record = self._build_record(postType, attrs)
        # Resolve the sink lazily so the legacy global is only required
        # when no explicit sink was supplied.
        sink = self._out if self._out is not None else myjsonfile
        sink.write(json.dumps(record, separators=(',', ':')) + "\n")

    @staticmethod
    def _build_record(postType, attrs):
        """Return the dict exported for one post row (key order is output order)."""
        record = {}

        # In some questions e.g. 10030718 the ownerID is missing and we have
        # OwnerDisplayName instead. If neither is present, 'user' is omitted.
        if "OwnerUserId" in attrs:
            record['user'] = attrs["OwnerUserId"]
        elif "OwnerDisplayName" in attrs:
            record['user'] = attrs["OwnerDisplayName"]

        # Tags look like "<a><b>"; splitting on runs of angle brackets leaves
        # empty strings at both ends, which are dropped.
        tags = _TAG_DELIMITERS.split(attrs["Tags"]) if "Tags" in attrs else []
        record['tags'] = [tag for tag in tags if tag]

        record['questionId'] = attrs["Id"]
        if postType == 2:
            # Answer posts have a ParentId that links to the question post.
            record['questionId'] = attrs["ParentId"]

        if postType == 1:
            if "CreationDate" in attrs:
                record['creationDate'] = attrs["CreationDate"]
            if "Title" in attrs:
                record['title'] = attrs["Title"].replace("\n", " ").replace("\r", " ")
            if "AcceptedAnswerId" in attrs:
                record['acceptedAnswerId'] = attrs["AcceptedAnswerId"]
            record['type'] = "question"
        elif postType == 2:
            if "CreationDate" in attrs:
                record['creationDate'] = attrs["CreationDate"]
            if "Id" in attrs:
                record['answerId'] = attrs["Id"]
            record['type'] = "answer"

        if "Body" in attrs:
            # Strip the HTML markup, then flatten the text onto a single line.
            text = BeautifulSoup(attrs["Body"], "html.parser").get_text()
            text = text.replace("\n", " ").replace("\r", "")
            record['body'] = _WHITESPACE_RUN.sub(" ", text)

        return record


parser = xml.sax.make_parser()
print("loading Posts")
postsFilename = "Posts.xml"
with open('posts.json', 'w') as myjsonfile:
    parser.setContentHandler(PostsHandler(myjsonfile))
    # Binary mode lets the XML parser apply the encoding declared in the
    # document instead of the locale default, and the context manager
    # closes the input handle once parsing finishes or fails.
    with open(postsFilename, "rb") as postsFile:
        parser.parse(postsFile)