import re # options for the indentation of text inside of xml or html nodes # solving issue https://github.com/leforestier/yattag/issues/38 # while maintaining compatibility with older versions of yattag NO = False FIRST_LINE = True EACH_LINE = 2 __all__ = ['indent', 'NO', 'FIRST_LINE', 'EACH_LINE'] class TokenMeta(type): _token_classes = {} def __new__(cls, name, bases, attrs): kls = type.__new__(cls, name, bases, attrs) cls._token_classes[name] = kls return kls @classmethod def getclass(cls, name): return cls._token_classes[name] # need to proceed that way for Python 2/3 compatility: TokenBase = TokenMeta('TokenBase', (object,), {}) class Token(TokenBase): regex = None def __init__(self, groupdict): self.content = groupdict[self.__class__.__name__] class Text(Token): regex = '[^<>]+' def __init__(self, *args, **kwargs): super(Text, self).__init__(*args, **kwargs) self._isblank = None @property def isblank(self): if self._isblank is None: self._isblank = not self.content.strip() return self._isblank class Comment(Token): regex = r').)*.?-->' class CData(Token): regex = r').*).?\]\]>' class Doctype(Token): regex = r'''"']+|"[^"]*"|'[^']*'))*>''' _open_tag_start = r''' <\s* (?P<{tag_name_key}>{tag_name_rgx}) (\s+[^/><"=\s]+ # attribute (\s*=\s* ( [^/><"=\s]+ | # unquoted attribute value ("[^"]*") | # " quoted attribute value ('[^']*') # ' quoted attribute value ) )? # the attribute value is optional (we're forgiving) )* \s*''' class Script(Token): _end_script = r'<\s*/\s*script\s*>' regex = _open_tag_start.format( tag_name_key = 'script_ignore', tag_name_rgx = 'script', ) + r'>((?!({end_script})).)*.?{end_script}'.format( end_script = _end_script ) class Style(Token): _end_style = r'<\s*/\s*style\s*>' regex = _open_tag_start.format( tag_name_key = 'style_ignore', tag_name_rgx = 'style', ) + r'>((?!({end_style})).)*.?{end_style}'.format( end_style = _end_style ) class XMLDeclaration(Token): regex = _open_tag_start.format( tag_name_key = 'xmldecl_ignore', tag_name_rgx = r'\?\s*xml' ) + r'\?\s*>' class XMLProcessingInstruction(Token): regex = r'<\?(?!xml\s)[^?/><"\s]+(\s[^?>]*)?\?>' class NamedTagTokenMeta(TokenMeta): def __new__(cls, name, bases, attrs): kls = TokenMeta.__new__(cls, name, bases, attrs) if name not in('NamedTagTokenBase', 'NamedTagToken'): kls.tag_name_key = 'tag_name_%s' % name kls.regex = kls.regex_template.format( tag_name_key = kls.tag_name_key, tag_name_rgx = kls.tag_name_rgx ) return kls # need to proceed that way for Python 2/3 compatility NamedTagTokenBase = NamedTagTokenMeta( 'NamedTagTokenBase', (Token,), {'tag_name_rgx': r'[^?/><"\s]+'} ) class NamedTagToken(NamedTagTokenBase): def __init__(self, groupdict): super(NamedTagToken, self).__init__(groupdict) self.tag_name = groupdict[self.__class__.tag_name_key] class OpenTag(NamedTagToken): regex_template = _open_tag_start + '>' class SelfTag(NamedTagToken): # a self closing tag regex_template = _open_tag_start + r'/\s*>' class CloseTag(NamedTagToken): regex_template = r'<\s*/(?P<{tag_name_key}>{tag_name_rgx})(\s[^/><"]*)?>' class XMLTokenError(Exception): pass class Tokenizer(object): def __init__(self, token_classes): self.token_classes = token_classes self.token_names = [kls.__name__ for kls in token_classes] self.get_token = None def _compile_regex(self): self.get_token = re.compile( '|'.join( '(?P<%s>%s)' % (klass.__name__, klass.regex) for klass in self.token_classes ), re.X | re.I | re.S ).match def tokenize(self, string): if not self.get_token: self._compile_regex() result = [] append = result.append while string: mobj = self.get_token(string) if mobj: groupdict = mobj.groupdict() class_name = next(name for name in self.token_names if groupdict[name]) token = TokenMeta.getclass(class_name)(groupdict) append(token) string = string[len(token.content):] else: raise XMLTokenError("Unrecognized XML token near %s" % repr(string[:100])) return result tokenize = Tokenizer( (Text, Comment, CData, Doctype, XMLDeclaration, Script, Style, OpenTag, SelfTag, CloseTag, XMLProcessingInstruction) ).tokenize class TagMatcher(object): class SameNameMatcher(object): def __init__(self): self.unmatched_open = [] self.matched = {} def sigclose(self, i): if self.unmatched_open: open_tag = self.unmatched_open.pop() self.matched[open_tag] = i self.matched[i] = open_tag return open_tag else: return None def sigopen(self, i): self.unmatched_open.append(i) def __init__(self, token_list, blank_is_text = False): self.token_list = token_list self.name_matchers = {} self.direct_text_parents = set() for i in range(len(token_list)): token = token_list[i] tpe = type(token) if tpe is OpenTag: self._get_name_matcher(token.tag_name).sigopen(i) elif tpe is CloseTag: self._get_name_matcher(token.tag_name).sigclose(i) # TODO move this somewhere else current_nodes = [] for i in range(len(token_list)): token = token_list[i] tpe = type(token) if tpe is OpenTag and self.ismatched(i): current_nodes.append(i) elif tpe is CloseTag and self.ismatched(i): current_nodes.pop() elif tpe is Text and (blank_is_text or not token.isblank): if current_nodes: self.direct_text_parents.add(current_nodes[-1]) def _get_name_matcher(self, tag_name): try: return self.name_matchers[tag_name] except KeyError: self.name_matchers[tag_name] = name_matcher = self.__class__.SameNameMatcher() return name_matcher def ismatched(self, i): return i in self.name_matchers[self.token_list[i].tag_name].matched def directly_contains_text(self, i): return i in self.direct_text_parents new_line_rgx= re.compile(r'(\r?\n)', flags = re.MULTILINE) def indent(string, indentation = ' ', newline = '\n', indent_text = NO, blank_is_text = False): """ takes a string representing a html or xml document and returns a well indented version of it arguments: - string: the string to process - indentation: the indentation unit (default to two spaces) - newline: the string to be use for new lines (default to '\\n', could be set to '\\r\\n' for example) - indent_text: the value of this option should one of yattag.NO, yattag.FIRST_LINE or yattag.EACH_LINE if indent_text is NO, text nodes won't be indented, and the content of any node directly containing text will be unchanged:

Hello

will be unchanged

Hello world!

will be unchanged since ' world!' is directly contained in the

node. This is the default since that's generally what you want for HTML. if indent_text is FIRST_LINE, the first line of text nodes will be indented:

Hello

would result in

hello

and:

Hello, where are the keys?

would result in

hello, where are the keys?

if indent_text is EACH_LINE, each line inside the text nodes will be indented: object HelloWorld { def main(args: Array[String]) { println("Hello, world!") } } would result in object HelloWorld { def main(args: Array[String]) { println("Hello, world!") } } - blank_is_text: if False, completely blank texts are ignored. That is the default. """ tokens = tokenize(string) tag_matcher = TagMatcher(tokens, blank_is_text = blank_is_text) ismatched = tag_matcher.ismatched directly_contains_text = tag_matcher.directly_contains_text result = [] append = result.append level = 0 sameline = 0 was_just_opened = False tag_appeared = False def _indent(): if tag_appeared: append(newline) for i in range(level): append(indentation) def _append_text(text): if not sameline: _indent() if indent_text is EACH_LINE: append(new_line_rgx.sub(r'\1' + indentation * level, text)) else: append(text) for i,token in enumerate(tokens): tpe = type(token) if tpe is Text: if blank_is_text or not token.isblank: _append_text(token.content) was_just_opened = False elif tpe is OpenTag and ismatched(i): was_just_opened = True if sameline: sameline += 1 else: _indent() if indent_text is NO and directly_contains_text(i): sameline = sameline or 1 append(token.content) level += 1 tag_appeared = True elif tpe is CloseTag and ismatched(i): level -= 1 tag_appeared = True if sameline: sameline -= 1 elif not was_just_opened: _indent() append(token.content) was_just_opened = False else: if not sameline: _indent() append(token.content) was_just_opened = False tag_appeared = True return ''.join(result) if __name__ == '__main__': import sys print(indent(sys.stdin.read()))