1  __doc__ = """External interface to the BeautifulSoup HTML parser. 
  2  """ 
  3   
  4  __all__ = ["fromstring", "parse", "convert_tree"] 
  5   
  6  from lxml import etree, html 
  7  from BeautifulSoup import \ 
  8       BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString 
  9   
 10   
 11 -def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): 
  12      """Parse a string of HTML data into an Element tree using the 
 13      BeautifulSoup parser. 
 14   
 15      Returns the root ``<html>`` Element of the tree. 
 16   
 17      You can pass a different BeautifulSoup parser through the 
 18      `beautifulsoup` keyword, and a diffent Element factory function 
 19      through the `makeelement` keyword.  By default, the standard 
 20      ``BeautifulSoup`` class and the default factory of `lxml.html` are 
 21      used. 
 22      """ 
 23      return _parse(data, beautifulsoup, makeelement, **bsargs) 
  24   
 25 -def parse(file, beautifulsoup=None, makeelement=None, **bsargs): 
  26      """Parse a file into an ElemenTree using the BeautifulSoup parser. 
 27   
 28      You can pass a different BeautifulSoup parser through the 
 29      `beautifulsoup` keyword, and a diffent Element factory function 
 30      through the `makeelement` keyword.  By default, the standard 
 31      ``BeautifulSoup`` class and the default factory of `lxml.html` are 
 32      used. 
 33      """ 
 34      if not hasattr(file, 'read'): 
 35          file = open(file) 
 36      root = _parse(file, beautifulsoup, makeelement, **bsargs) 
 37      return etree.ElementTree(root) 
  38   
 55   
 56   
 57   
 58   
 59 -def _parse(source, beautifulsoup, makeelement, **bsargs): 
  73   
 79   
 81      SubElement = etree.SubElement 
 82      et_child = None 
 83      for child in beautiful_soup_tree: 
 84          if isinstance(child, Tag): 
 85              et_child = SubElement(parent, child.name, attrib=dict( 
 86                  [(k, unescape(v)) for (k,v) in child.attrs])) 
 87              _convert_children(et_child, child, makeelement) 
 88          elif type(child) is NavigableString: 
 89              _append_text(parent, et_child, unescape(child)) 
 90          else: 
 91              if isinstance(child, Comment): 
 92                  parent.append(etree.Comment(child)) 
 93              elif isinstance(child, ProcessingInstruction): 
 94                  parent.append(etree.ProcessingInstruction( 
 95                      *child.split(' ', 1))) 
 96              else:  
 97                  _append_text(parent, et_child, unescape(child)) 
  98   
 99 -def _append_text(parent, element, text): 
 100      if element is None: 
101          parent.text = (parent.text or '') + text 
102      else: 
103          element.tail = (element.tail or '') + text 
 104   
105   
106   
107   
108  try: 
109      from html.entities import name2codepoint  
110  except ImportError: 
111      from htmlentitydefs import name2codepoint 
112  import re 
113   
# Bound ``sub`` method of a pattern matching named entity references
# such as ``&amp;`` (group 1 is the name between ``&`` and ``;``).
# Call as ``handle_entities(repl, string)``.  The pattern is a raw
# string so ``\w`` reaches the regex engine directly instead of relying
# on Python passing an unrecognised string escape through unchanged.
handle_entities = re.compile(r"&(\w+);").sub
115   
117      if not string: 
118          return '' 
119       
120      def unescape_entity(m): 
121          try: 
122              return unichr(name2codepoint[m.group(1)]) 
123          except KeyError: 
124              return m.group(0)  
 125      return handle_entities(unescape_entity, string) 
126