1   
   2   
   3   
   4   
   5   
   6   
   7   
   8   
   9   
  10   
  11   
  12   
  13   
  14   
  15   
  16   
  17   
  18   
  19   
  20   
  21   
  22   
  23   
  24   
  25   
  26   
  27   
  28   
  29   
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  import sys 
  35  import re 
  36  try: 
  37      from urlparse import urljoin 
  38  except ImportError: 
  39       
  40      from urllib.parse import urljoin 
  41  import copy 
  42  from lxml import etree 
  43  from lxml.html import defs 
  44  from lxml.html._setmixin import SetMixin 
  45  try: 
  46      from collections import MutableMapping as DictMixin 
  47  except ImportError: 
  48       
  49      from UserDict import DictMixin 
  50  try: 
  51      set 
  52  except NameError: 
  53       
  54      from sets import Set as set 
  55  try: 
  56      bytes 
  57  except NameError: 
  58       
  59      bytes = str 
  60  try: 
  61      unicode 
  62  except NameError: 
  63       
  64      unicode = str 
  65  try: 
  66      basestring 
  67  except NameError: 
  68       
  69      basestring = (str, bytes) 
  70   
  72      if not s: 
  73          return s 
  74      import sys 
  75      if sys.version_info[0] >= 3: 
  76          sub = re.compile(r"^(\s*)u'", re.M).sub 
  77      else: 
  78          sub = re.compile(r"^(\s*)b'", re.M).sub 
  79      return sub(r"\1'", s) 
   80   
  81  __all__ = [ 
  82      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  83      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  84      'find_rel_links', 'find_class', 'make_links_absolute', 
  85      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 
  86   
  87  XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" 
  88   
  89  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", 
  90                                 namespaces={'x':XHTML_NAMESPACE}) 
  91  _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", 
  92                               namespaces={'x':XHTML_NAMESPACE}) 
  93  _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", 
  94                             namespaces={'x':XHTML_NAMESPACE}) 
  95   
  96  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  97  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  98  _collect_string_content = etree.XPath("string()") 
  99  _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I) 
 100  _css_import_re = re.compile(r'@import "(.*?)"') 
 101  _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", 
 102                             namespaces={'x':XHTML_NAMESPACE}) 
 103  _archive_re = re.compile(r'[^ ]+') 
 104   
 106      if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 
 107          return s[1:-1], pos+1 
 108      else: 
 109          return s,pos 
  110   
 120   
 126   
 128   
 130          """ 
 131          Returns the base URL, given when the page was parsed. 
 132   
 133          Use with ``urlparse.urljoin(el.base_url, href)`` to get 
 134          absolute URLs. 
 135          """ 
 136          return self.getroottree().docinfo.URL 
  137      base_url = property(base_url, doc=base_url.__doc__) 
 138   
 144      forms = property(forms, doc=forms.__doc__) 
 145   
 147          """ 
 148          Return the <body> element.  Can be called from a child element 
 149          to get the document's body. 
 150          """ 
 151          return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] 
  152      body = property(body, doc=body.__doc__) 
 153   
 155          """ 
 156          Returns the <head> element.  Can be called from a child 
 157          element to get the document's head. 
 158          """ 
 159          return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] 
  160      head = property(head, doc=head.__doc__) 
 161   
 163          """ 
 164          Get or set any <label> element associated with this element. 
 165          """ 
 166          id = self.get('id') 
 167          if not id: 
 168              return None 
 169          result = _label_xpath(self, id=id) 
 170          if not result: 
 171              return None 
 172          else: 
 173              return result[0] 
  175          id = self.get('id') 
 176          if not id: 
 177              raise TypeError( 
 178                  "You cannot set a label for an element (%r) that has no id" 
 179                  % self) 
 180          if _nons(label.tag) != 'label': 
 181              raise TypeError( 
 182                  "You can only assign label to a label element (not %r)" 
 183                  % label) 
 184          label.set('for', id) 
  189      label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__) 
 190   
 192          """ 
 193          Removes this element from the tree, including its children and 
 194          text.  The tail text is joined to the previous element or 
 195          parent. 
 196          """ 
 197          parent = self.getparent() 
 198          assert parent is not None 
 199          if self.tail: 
 200              previous = self.getprevious() 
 201              if previous is None: 
 202                  parent.text = (parent.text or '') + self.tail 
 203              else: 
 204                  previous.tail = (previous.tail or '') + self.tail 
 205          parent.remove(self) 
  206   
 208          """ 
 209          Remove the tag, but not its children or text.  The children and text 
 210          are merged into the parent. 
 211   
 212          Example:: 
 213   
 214              >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 
 215              >>> h.find('.//b').drop_tag() 
 216              >>> print(tostring(h, encoding=unicode)) 
 217              <div>Hello World!</div> 
 218          """ 
 219          parent = self.getparent() 
 220          assert parent is not None 
 221          previous = self.getprevious() 
 222          if self.text and isinstance(self.tag, basestring): 
 223               
 224              if previous is None: 
 225                  parent.text = (parent.text or '') + self.text 
 226              else: 
 227                  previous.tail = (previous.tail or '') + self.text 
 228          if self.tail: 
 229              if len(self): 
 230                  last = self[-1] 
 231                  last.tail = (last.tail or '') + self.tail 
 232              elif previous is None: 
 233                  parent.text = (parent.text or '') + self.tail 
 234              else: 
 235                  previous.tail = (previous.tail or '') + self.tail 
 236          index = parent.index(self) 
 237          parent[index:index+1] = self[:] 
  238   
 240          """ 
 241          Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 
 242          """ 
 243          rel = rel.lower() 
 244          return [el for el in _rel_links_xpath(self) 
 245                  if el.get('rel').lower() == rel] 
  246   
 248          """ 
 249          Find any elements with the given class name. 
 250          """ 
 251          return _class_xpath(self, class_name=class_name) 
  252   
 254          """ 
 255          Get the first element in a document with the given id.  If none is 
 256          found, return the default argument if provided or raise KeyError 
 257          otherwise. 
 258   
 259          Note that there can be more than one element with the same id, 
 260          and this isn't uncommon in HTML documents found in the wild. 
 261          Browsers return only the first match, and this function does 
 262          the same. 
 263          """ 
 264          try: 
 265               
 266               
 267              return _id_xpath(self, id=id)[0] 
 268          except IndexError: 
 269              if default: 
 270                  return default[0] 
 271              else: 
 272                  raise KeyError(id) 
  273   
 274 -    def text_content(self): 
  275          """ 
 276          Return the text content of the tag (and the text in any children). 
 277          """ 
 278          return _collect_string_content(self) 
  279   
 280 -    def cssselect(self, expr, translator='html'): 
  281          """ 
 282          Run the CSS expression on this element and its children, 
 283          returning a list of the results. 
 284   
 285          Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 
 286          -- note that pre-compiling the expression can provide a substantial 
 287          speedup. 
 288          """ 
 289           
 290          from lxml.cssselect import CSSSelector 
 291          return CSSSelector(expr, translator=translator)(self) 
  292   
 293       
 294       
 295       
 296   
 297 -    def make_links_absolute(self, base_url=None, resolve_base_href=True, 
 298                              handle_failures=None): 
  299          """ 
 300          Make all links in the document absolute, given the 
 301          ``base_url`` for the document (the full URL where the document 
 302          came from), or if no ``base_url`` is given, then the ``.base_url`` 
 303          of the document. 
 304   
 305          If ``resolve_base_href`` is true, then any ``<base href>`` 
 306          tags in the document are used *and* removed from the document. 
 307          If it is false then any such tag is ignored. 
 308   
 309          If ``handle_failures`` is None (default), a failure to process 
 310          a URL will abort the processing.  If set to 'ignore', errors 
 311          are ignored.  If set to 'discard', failing URLs will be removed. 
 312          """ 
 313          if base_url is None: 
 314              base_url = self.base_url 
 315              if base_url is None: 
 316                  raise TypeError( 
 317                      "No base_url given, and the document has no base_url") 
 318          if resolve_base_href: 
 319              self.resolve_base_href() 
 320   
 321          if handle_failures == 'ignore': 
 322              def link_repl(href): 
 323                  try: 
 324                      return urljoin(base_url, href) 
 325                  except ValueError: 
 326                      return href 
  327          elif handle_failures == 'discard': 
 328              def link_repl(href): 
 329                  try: 
 330                      return urljoin(base_url, href) 
 331                  except ValueError: 
 332                      return None 
  333          elif handle_failures is None: 
 334              def link_repl(href): 
 335                  return urljoin(base_url, href) 
 336          else: 
 337              raise ValueError( 
 338                  "unexpected value for handle_failures: %r" % handle_failures) 
 339   
 340          self.rewrite_links(link_repl) 
 341   
 343          """ 
 344          Find any ``<base href>`` tag in the document, and apply its 
 345          values to all links found in the document.  Also remove the 
 346          tag once it has been applied. 
 347   
 348          If ``handle_failures`` is None (default), a failure to process 
 349          a URL will abort the processing.  If set to 'ignore', errors 
 350          are ignored.  If set to 'discard', failing URLs will be removed. 
 351          """ 
 352          base_href = None 
 353          basetags = self.xpath('//base[@href]|//x:base[@href]', 
 354                                namespaces={'x': XHTML_NAMESPACE}) 
 355          for b in basetags: 
 356              base_href = b.get('href') 
 357              b.drop_tree() 
 358          if not base_href: 
 359              return 
 360          self.make_links_absolute(base_href, resolve_base_href=False, 
 361                                   handle_failures=handle_failures) 
  362   
 364          """ 
 365          Yield (element, attribute, link, pos), where attribute may be None 
 366          (indicating the link is in the text).  ``pos`` is the position 
 367          where the link occurs; often 0, but sometimes something else in 
 368          the case of links in stylesheets or style tags. 
 369   
 370          Note: <base href> is *not* taken into account in any way.  The 
 371          link you get is exactly the link in the document. 
 372   
 373          Note: multiple links inside of a single text string or 
 374          attribute value are returned in reversed order.  This makes it 
 375          possible to replace or delete them from the text string value 
 376          based on their reported text positions.  Otherwise, a 
 377          modification at one text position can change the positions of 
 378          links reported later on. 
 379          """ 
 380          link_attrs = defs.link_attrs 
 381          for el in self.iter(): 
 382              attribs = el.attrib 
 383              tag = _nons(el.tag) 
 384              if tag != 'object': 
 385                  for attrib in link_attrs: 
 386                      if attrib in attribs: 
 387                          yield (el, attrib, attribs[attrib], 0) 
 388              elif tag == 'object': 
 389                  codebase = None 
 390                   
 391                   
 392                  if 'codebase' in attribs: 
 393                      codebase = el.get('codebase') 
 394                      yield (el, 'codebase', codebase, 0) 
 395                  for attrib in 'classid', 'data': 
 396                      if attrib in attribs: 
 397                          value = el.get(attrib) 
 398                          if codebase is not None: 
 399                              value = urljoin(codebase, value) 
 400                          yield (el, attrib, value, 0) 
 401                  if 'archive' in attribs: 
 402                      for match in _archive_re.finditer(el.get('archive')): 
 403                          value = match.group(0) 
 404                          if codebase is not None: 
 405                              value = urljoin(codebase, value) 
 406                          yield (el, 'archive', value, match.start()) 
 407              if tag == 'param': 
 408                  valuetype = el.get('valuetype') or '' 
 409                  if valuetype.lower() == 'ref': 
 410                       
 411                       
 412                       
 413                       
 414                       
 415                       
 416                      yield (el, 'value', el.get('value'), 0) 
 417              if tag == 'style' and el.text: 
 418                  urls = [ 
 419                      _unquote_match(match.group(1), match.start(1)) 
 420                      for match in _css_url_re.finditer(el.text) 
 421                      ] + [ 
 422                      (match.group(1), match.start(1)) 
 423                      for match in _css_import_re.finditer(el.text) 
 424                      ] 
 425                  if urls: 
 426                       
 427                      urls = [ (start, url) for (url, start) in urls ] 
 428                      urls.sort() 
 429                       
 430                       
 431                      urls.reverse() 
 432                      for start, url in urls: 
 433                          yield (el, None, url, start) 
 434              if 'style' in attribs: 
 435                  urls = list(_css_url_re.finditer(attribs['style'])) 
 436                  if urls: 
 437                       
 438                      for match in urls[::-1]: 
 439                          url, start = _unquote_match(match.group(1), match.start(1)) 
 440                          yield (el, 'style', url, start) 
  441   
 442 -    def rewrite_links(self, link_repl_func, resolve_base_href=True, 
 443                        base_href=None): 
  444          """ 
 445          Rewrite all the links in the document.  For each link 
 446          ``link_repl_func(link)`` will be called, and the return value 
 447          will replace the old link. 
 448   
 449          Note that links may not be absolute (unless you first called 
 450          ``make_links_absolute()``), and may be internal (e.g., 
 451          ``'#anchor'``).  They can also be values like 
 452          ``'mailto:email'`` or ``'javascript:expr'``. 
 453   
 454          If you give ``base_href`` then all links passed to 
 455          ``link_repl_func()`` will take that into account. 
 456   
 457          If the ``link_repl_func`` returns None, the attribute or 
 458          tag text will be removed completely. 
 459          """ 
 460          if base_href is not None: 
 461               
 462               
 463              self.make_links_absolute( 
 464                  base_href, resolve_base_href=resolve_base_href) 
 465          elif resolve_base_href: 
 466              self.resolve_base_href() 
 467   
 468          for el, attrib, link, pos in self.iterlinks(): 
 469              new_link = link_repl_func(link.strip()) 
 470              if new_link == link: 
 471                  continue 
 472              if new_link is None: 
 473                   
 474                  if attrib is None: 
 475                      el.text = '' 
 476                  else: 
 477                      del el.attrib[attrib] 
 478                  continue 
 479   
 480              if attrib is None: 
 481                  new = el.text[:pos] + new_link + el.text[pos+len(link):] 
 482                  el.text = new 
 483              else: 
 484                  cur = el.get(attrib) 
 485                  if not pos and len(cur) == len(link): 
 486                      new = new_link   
 487                  else: 
 488                      new = cur[:pos] + new_link + cur[pos+len(link):] 
 489                  el.set(attrib, new) 
  490   
 491   
 493      """ 
 494      An object that represents a method on an element as a function; 
 495      the function takes either an element or an HTML string.  It 
 496      returns whatever the function normally returns, or if the function 
 497      works in-place (and so returns None) it returns a serialized form 
 498      of the resulting document. 
 499      """ 
 505          result_type = type(doc) 
 506          if isinstance(doc, basestring): 
 507              if 'copy' in kw: 
 508                  raise TypeError( 
 509                      "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) 
 510              doc = fromstring(doc, **kw) 
 511          else: 
 512              if 'copy' in kw: 
 513                  make_a_copy = kw.pop('copy') 
 514              else: 
 515                  make_a_copy = self.copy 
 516              if make_a_copy: 
 517                  doc = copy.deepcopy(doc) 
 518          meth = getattr(doc, self.name) 
 519          result = meth(*args, **kw) 
 520           
 521          if result is None: 
 522               
 523              return _transform_result(result_type, doc) 
 524          else: 
 525              return result 
   526   
 527  find_rel_links = _MethodFunc('find_rel_links', copy=False) 
 528  find_class = _MethodFunc('find_class', copy=False) 
 529  make_links_absolute = _MethodFunc('make_links_absolute', copy=True) 
 530  resolve_base_href = _MethodFunc('resolve_base_href', copy=True) 
 531  iterlinks = _MethodFunc('iterlinks', copy=False) 
 532  rewrite_links = _MethodFunc('rewrite_links', copy=True) 
 533   
 536   
 539   
 542   
 545   
 546   
 548      """A lookup scheme for HTML Element classes. 
 549   
 550      To create a lookup instance with different Element classes, pass a tag 
 551      name mapping of Element classes in the ``classes`` keyword argument and/or 
 552      a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 
 553      The special key '*' denotes a Mixin class that should be mixed into all 
 554      Element classes. 
 555      """ 
 556      _default_element_classes = {} 
 557   
 558 -    def __init__(self, classes=None, mixins=None): 
  575   
 576 -    def lookup(self, node_type, document, namespace, name): 
   587   
 588   
 589   
 590   
 591   
 592  _looks_like_full_html_unicode = re.compile( 
 593      unicode(r'^\s*<(?:html|!doctype)'), re.I).match 
 594  _looks_like_full_html_bytes = re.compile( 
 595      r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match 
 596   
 605   
 608      """ 
 609      Parses several HTML elements, returning a list of elements. 
 610   
 611      The first item in the list may be a string (though leading 
 612      whitespace is removed).  If no_leading_text is true, then it will 
 613      be an error if there is leading text, and it will always be a list 
 614      of only elements. 
 615   
 616      base_url will set the document's base_url attribute (and the tree's docinfo.URL) 
 617      """ 
 618      if parser is None: 
 619          parser = html_parser 
 620       
 621      if isinstance(html, bytes): 
 622          if not _looks_like_full_html_bytes(html): 
 623              html = '<html><body>%s</body></html>'.encode('ascii') % html 
 624      else: 
 625          if not _looks_like_full_html_unicode(html): 
 626              html = '<html><body>%s</body></html>' % html 
 627      doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 
 628      assert _nons(doc.tag) == 'html' 
 629      bodies = [e for e in doc if _nons(e.tag) == 'body'] 
 630      assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) 
 631      body = bodies[0] 
 632      elements = [] 
 633      if no_leading_text and body.text and body.text.strip(): 
 634          raise etree.ParserError( 
 635              "There is leading text: %r" % body.text) 
 636      if body.text and body.text.strip(): 
 637          elements.append(body.text) 
 638      elements.extend(body) 
 639       
 640       
 641      return elements 
  642   
 645      """ 
 646      Parses a single HTML element; it is an error if there is more than 
 647      one element, or if anything but whitespace precedes or follows the 
 648      element. 
 649   
 650      If create_parent is true (or is a tag name) then a parent node 
 651      will be created to encapsulate the HTML in a single element.  In 
 652      this case, leading or trailing text is allowed. 
 653   
 654      base_url will set the document's base_url attribute (and the tree's docinfo.URL) 
 655      """ 
 656      if parser is None: 
 657          parser = html_parser 
 658   
 659      accept_leading_text = bool(create_parent) 
 660   
 661      elements = fragments_fromstring( 
 662          html, parser=parser, no_leading_text=not accept_leading_text, 
 663          base_url=base_url, **kw) 
 664   
 665      if create_parent: 
 666          if not isinstance(create_parent, basestring): 
 667              create_parent = 'div' 
 668          new_root = Element(create_parent) 
 669          if elements: 
 670              if isinstance(elements[0], basestring): 
 671                  new_root.text = elements[0] 
 672                  del elements[0] 
 673              new_root.extend(elements) 
 674          return new_root 
 675   
 676      if not elements: 
 677          raise etree.ParserError('No elements found') 
 678      if len(elements) > 1: 
 679          raise etree.ParserError( 
 680              "Multiple elements found (%s)" 
 681              % ', '.join([_element_name(e) for e in elements])) 
 682      el = elements[0] 
 683      if el.tail and el.tail.strip(): 
 684          raise etree.ParserError( 
 685              "Element followed by text: %r" % el.tail) 
 686      el.tail = None 
 687      return el 
  688   
 689 -def fromstring(html, base_url=None, parser=None, **kw): 
  755   
 756 -def parse(filename_or_url, parser=None, base_url=None, **kw): 
  757      """ 
 758      Parse a filename, URL, or file-like object into an HTML document 
 759      tree.  Note: this returns a tree, not an element.  Use 
 760      ``parse(...).getroot()`` to get the document root. 
 761   
 762      You can override the base URL with the ``base_url`` keyword.  This 
 763      is most useful when parsing from a file-like object. 
 764      """ 
 765      if parser is None: 
 766          parser = html_parser 
 767      return etree.parse(filename_or_url, parser, base_url=base_url, **kw) 
  768   
 770       
 771       
 772      for el in el.iter(): 
 773          if _nons(el.tag) in defs.block_tags: 
 774              return True 
 775      return False 
  776   
 778      if isinstance(el, etree.CommentBase): 
 779          return 'comment' 
 780      elif isinstance(el, basestring): 
 781          return 'string' 
 782      else: 
 783          return _nons(el.tag) 
  784   
 785   
 786   
 787   
 788   
 893   
 894  HtmlElementClassLookup._default_element_classes['form'] = FormElement 
 895   
 932   
 934      if not url: 
 935          raise ValueError("cannot submit, no URL provided") 
 936       
 937      try: 
 938          from urllib import urlencode, urlopen 
 939      except ImportError:  
 940          from urllib.request import urlopen 
 941          from urllib.parse import urlencode 
 942      if method == 'GET': 
 943          if '?' in url: 
 944              url += '&' 
 945          else: 
 946              url += '?' 
 947          url += urlencode(values) 
 948          data = None 
 949      else: 
 950          data = urlencode(values) 
 951      return urlopen(url, data) 
  952   
 954   
 962          raise KeyError( 
 963              "You cannot remove keys from ElementDict") 
  967          return item in self.inputs 
  972   
 974          return '<%s for form %s>' % ( 
 975              self.__class__.__name__, 
 976              self.inputs.form._name()) 
   977   
1043   
1071   
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read, assigned and deleted through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML,
        # everything else as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        # Leading text first, then the serialised form of each child.
        pieces = [self.text or '']
        for child in self:
            pieces.append(
                etree.tostring(child, method=method, encoding=unicode))
        return ''.join(pieces)

    def _value__set(self, value):
        # Replace the whole content: remove all children, then store
        # the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value leaves an empty textarea behind.
        self.text = ''
        del self[:]

    value = property(fget=_value__get, fset=_value__set, fdel=_value__del,
                     doc=_value__get.__doc__)
 1098   
1099  HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement 
1100   
1102      """ 
1103      ``<select>`` element.  You can get the name with ``.name``. 
1104   
1105      ``.value`` will be the value of the selected option, unless this 
1106      is a multi-select element (``<select multiple>``), in which case 
1107      it will be a set-like object.  In either case ``.value_options`` 
1108      gives the possible values. 
1109   
1110      The boolean attribute ``.multiple`` shows if this is a 
1111      multi-select. 
1112      """ 
1113   
1115          """ 
1116          Get/set the value of this select (the selected option). 
1117   
1118          If this is a multi-select, this is a set-like object that 
1119          represents all the selected options. 
1120          """ 
1121          if self.multiple: 
1122              return MultipleSelectOptions(self) 
1123          for el in _options_xpath(self): 
1124              if el.get('selected') is not None: 
1125                  value = el.get('value') 
1126                  if value is None: 
1127                      value = el.text or '' 
1128                  if value: 
1129                      value = value.strip() 
1130                  return value 
1131          return None 
 1132   
1134          if self.multiple: 
1135              if isinstance(value, basestring): 
1136                  raise TypeError( 
1137                      "You must pass in a sequence") 
1138              self.value.clear() 
1139              self.value.update(value) 
1140              return 
1141          if value is not None: 
1142              value = value.strip() 
1143              for el in _options_xpath(self): 
1144                  opt_value = el.get('value') 
1145                  if opt_value is None: 
1146                      opt_value = el.text or '' 
1147                  if opt_value: 
1148                      opt_value = opt_value.strip() 
1149                  if opt_value == value: 
1150                      checked_option = el 
1151                      break 
1152              else: 
1153                  raise ValueError( 
1154                      "There is no option with the value of %r" % value) 
1155          for el in _options_xpath(self): 
1156              if 'selected' in el.attrib: 
1157                  del el.attrib['selected'] 
1158          if value is not None: 
1159              checked_option.set('selected', '') 
 1160   
1167   
1168      value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__) 
1169   
1184      value_options = property(value_options, doc=value_options.__doc__) 
1185   
1187          """ 
1188          Boolean attribute: is there a ``multiple`` attribute on this element. 
1189          """ 
1190          return 'multiple' in self.attrib 
 1192          if value: 
1193              self.set('multiple', '') 
1194          elif 'multiple' in self.attrib: 
1195              del self.attrib['multiple'] 
 1196      multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__) 
 1197   
1198  HtmlElementClassLookup._default_element_classes['select'] = SelectElement 
1199   
1201      """ 
1202      Represents all the selected options in a ``<select multiple>`` element. 
1203   
1204      You can add to this set-like option to select an option, or remove 
1205      to unselect the option. 
1206      """ 
1207   
1209          self.select = select 
 1210   
1212          """ 
1213          Iterator of all the ``<option>`` elements. 
1214          """ 
1215          return iter(_options_xpath(self.select)) 
 1216      options = property(options) 
1217   
1219          for option in self.options: 
1220              if 'selected' in option.attrib: 
1221                  opt_value = option.get('value') 
1222                  if opt_value is None: 
1223                      opt_value = option.text or '' 
1224                  if opt_value: 
1225                      opt_value = opt_value.strip() 
1226                  yield opt_value 
 1227   
1228 -    def add(self, item): 
 1229          for option in self.options: 
1230              opt_value = option.get('value') 
1231              if opt_value is None: 
1232                  opt_value = option.text or '' 
1233              if opt_value: 
1234                  opt_value = opt_value.strip() 
1235              if opt_value == item: 
1236                  option.set('selected', '') 
1237                  break 
1238          else: 
1239              raise ValueError( 
1240                  "There is no option with the value %r" % item) 
 1241   
1243          for option in self.options: 
1244              opt_value = option.get('value') 
1245              if opt_value is None: 
1246                  opt_value = option.text or '' 
1247              if opt_value: 
1248                  opt_value = opt_value.strip() 
1249              if opt_value == item: 
1250                  if 'selected' in option.attrib: 
1251                      del option.attrib['selected'] 
1252                  else: 
1253                      raise ValueError( 
1254                          "The option %r is not currently selected" % item) 
1255                  break 
1256          else: 
1257              raise ValueError( 
1258                  "There is not option with the value %r" % item) 
 1259   
1261          return '<%s {%s} for select name=%r>' % ( 
1262              self.__class__.__name__, 
1263              ', '.join([repr(v) for v in self]), 
1264              self.select.name) 
  1265   
1267      """ 
1268      This object represents several ``<input type=radio>`` elements 
1269      that have the same name. 
1270   
1271      You can use this like a list, but also use the property 
1272      ``.value`` to check/uncheck inputs.  Also you can use 
1273      ``.value_options`` to get the possible values. 
1274      """ 
1275   
1277          """ 
1278          Get/set the value, which checks the radio with that value (and 
1279          unchecks any other value). 
1280          """ 
1281          for el in self: 
1282              if 'checked' in el.attrib: 
1283                  return el.get('value') 
1284          return None 
 1285   
1287          if value is not None: 
1288              for el in self: 
1289                  if el.get('value') == value: 
1290                      checked_option = el 
1291                      break 
1292              else: 
1293                  raise ValueError( 
1294                      "There is no radio input with the value %r" % value) 
1295          for el in self: 
1296              if 'checked' in el.attrib: 
1297                  del el.attrib['checked'] 
1298          if value is not None: 
1299              checked_option.set('checked', '') 
 1300   
1303   
1304      value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__) 
1305   
1307          """ 
1308          Returns a list of all the possible values. 
1309          """ 
1310          return [el.get('value') for el in self] 
 1311      value_options = property(value_options, doc=value_options.__doc__) 
1312   
1314          return '%s(%s)' % ( 
1315              self.__class__.__name__, 
1316              list.__repr__(self)) 
  1317   
1319      """ 
1320      Represents a group of checkboxes (``<input type=checkbox>``) that 
1321      have the same name. 
1322   
1323      In addition to using this like a list, the ``.value`` attribute 
1324      returns a set-like object that you can add to or remove from to 
1325      check and uncheck checkboxes.  You can also use ``.value_options`` 
1326      to get the possible values. 
1327      """ 
1328   
1330          """ 
1331          Return a set-like object that can be modified to check or 
1332          uncheck individual checkboxes according to their value. 
1333          """ 
1334          return CheckboxValues(self) 
 1344      value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__) 
1345   
1347          """ 
1348          Returns a list of all the possible values. 
1349          """ 
1350          return [el.get('value') for el in self] 
 1351      value_options = property(value_options, doc=value_options.__doc__) 
1352   
1354          return '%s(%s)' % ( 
1355              self.__class__.__name__, list.__repr__(self)) 
  1356   
1358   
1359      """ 
1360      Represents the values of the checked checkboxes in a group of 
1361      checkboxes with the same name. 
1362      """ 
1363   
1366   
1368          return iter([ 
1369              el.get('value') 
1370              for el in self.group 
1371              if 'checked' in el.attrib]) 
 1372   
1373 -    def add(self, value): 
 1374          for el in self.group: 
1375              if el.get('value') == value: 
1376                  el.set('checked', '') 
1377                  break 
1378          else: 
1379              raise KeyError("No checkbox with value %r" % value) 
 1380   
1382          for el in self.group: 
1383              if el.get('value') == value: 
1384                  if 'checked' in el.attrib: 
1385                      del el.attrib['checked'] 
1386                  else: 
1387                      raise KeyError( 
1388                          "The checkbox with value %r was already unchecked" % value) 
1389                  break 
1390          else: 
1391              raise KeyError( 
1392                  "No checkbox with value %r" % value) 
 1393   
1395          return '<%s {%s} for checkboxes name=%r>' % ( 
1396              self.__class__.__name__, 
1397              ', '.join([repr(v) for v in self]), 
1398              self.group.name) 
  1399   
1483   
1484  HtmlElementClassLookup._default_element_classes['input'] = InputElement 
1485   
1487      """ 
1488      Represents a ``<label>`` element. 
1489   
1490      Label elements are linked to other elements with their ``for`` 
1491      attribute.  You can access this element with ``label.for_element``. 
1492      """ 
1493   
1495          """ 
1496          Get/set the element this label points to.  Return None if it 
1497          can't be found. 
1498          """ 
1499          id = self.get('for') 
1500          if not id: 
1501              return None 
1502          return self.body.get_element_by_id(id) 
 1504          id = other.get('id') 
1505          if not id: 
1506              raise TypeError( 
1507                  "Element %r has no id attribute" % other) 
1508          self.set('for', id) 
 1512      for_element = property(_for_element__get, _for_element__set, _for_element__del, 
1513                             doc=_for_element__get.__doc__) 
 1514   
1515  HtmlElementClassLookup._default_element_classes['label'] = LabelElement 
1516   
1517   
1518   
1519   
1520   
1535   
1537      """Convert all tags in an XHTML tree to HTML by removing their 
1538      XHTML namespace. 
1539      """ 
1540      try: 
1541          xhtml = xhtml.getroot() 
1542      except AttributeError: 
1543          pass 
1544      prefix = "{%s}" % XHTML_NAMESPACE 
1545      prefix_len = len(prefix) 
1546      for el in xhtml.iter(prefix + "*"): 
1547          el.tag = el.tag[prefix_len:] 
 1548   
1549   
1550   
1551  __str_replace_meta_content_type = re.compile( 
1552      r'<meta http-equiv="Content-Type"[^>]*>').sub 
1553  __bytes_replace_meta_content_type = re.compile( 
1554      r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub 
1555   
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed.
    When ``method`` is ``'html'`` and include_meta_content_type is
    false, every such tag is stripped from the serialised result.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc,
        method=method,
        pretty_print=pretty_print,
        encoding=encoding,
        with_tail=with_tail,
        doctype=doctype)

    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised

    # The serialiser may hand back either text or bytes; use the
    # substitution helper whose pattern matches the result type.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
 1628   
1629  tostring.__doc__ = __fix_docstring(tostring.__doc__) 
1630   
1632      """ 
1633      Open the HTML document in a web browser, saving it to a temporary 
1634      file to open it.  Note that this does not delete the file after 
1635      use.  This is mainly meant for debugging. 
1636      """ 
1637      import os 
1638      import webbrowser 
1639      import tempfile 
1640      if not isinstance(doc, etree._ElementTree): 
1641          doc = etree.ElementTree(doc) 
1642      handle, fn = tempfile.mkstemp(suffix='.html') 
1643      f = os.fdopen(handle, 'wb') 
1644      try: 
1645          doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8") 
1646      finally: 
1647           
1648          f.close() 
1649      url = 'file://' + fn.replace(os.path.sep, '/') 
1650      print(url) 
1651      webbrowser.open(url) 
 1652   
1653   
1654   
1655   
1656   
1658      """An HTML parser that is configured to return lxml.html Element 
1659      objects. 
1660      """ 
 1664   
1666      """An XML parser that is configured to return lxml.html Element 
1667      objects. 
1668   
1669      Note that this parser is not really XHTML aware unless you let it 
1670      load a DTD that declares the HTML entities.  To do this, make sure 
1671      you have the XHTML DTDs installed in your catalogs, and create the 
1672      parser like this:: 
1673   
1674          >>> parser = XHTMLParser(load_dtd=True) 
1675   
1676      If you additionally want to validate the document, use this:: 
1677   
1678          >>> parser = XHTMLParser(dtd_validation=True) 
1679   
1680      For catalog support, see http://www.xmlsoft.org/catalog.html. 
1681      """ 
 1685   
1687      """Create a new HTML Element. 
1688   
1689      This can also be used for XHTML documents. 
1690      """ 
1691      v = html_parser.makeelement(*args, **kw) 
1692      return v 
 1693   
1694  html_parser = HTMLParser() 
1695  xhtml_parser = XHTMLParser() 
1696