import re pat = re.compile(r"(\S+)|(<[^>]*>)") def scanner(text): for m in pat.finditer(text): token = m.group(0) print("Feeding:", repr(token)) yield token yield None text = " This is a foo file you know. " token_stream = scanner(text) def parse_elem(opening_tag): name = opening_tag[1:-1] closing_tag = "" % name items = parse_items(closing_tag) return (name, items) def parse_items(closing_tag = None): elems = [] while 1: token = next(token_stream) if not token: break # EOF if is_opening_tag(token): elems.append(parse_elem(token)) elif token == closing_tag: break else: elems.append(token) return elems def is_opening_tag(token): return token.startswith("<") and not token.startswith("