More lxml Syntax Examples

More lxml Syntax Examples

Continued from lxml HTML Scraping Syntax Examples

Contents:

  • Python Code
  • Resulting Output

Python Code

#!/usr/local/bin/python2.7
# -*- coding: UTF-8 -*-
"""lxmlScrapingExamplesMore.py takes INURL [URL to an html file] Producing OUTFILEPATH [a scrapped text file]
Usage:   lxmlScrapingExamplesMore.py INURL                                                  OUTFILEPATH
Example: lxmlScrapingExamplesMore.py http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm lxmlScrapingOutput.txt
"""
import sys,os

# joe professional opinion: package structure a bit goofy!   🙂
import lxml, lxml.html


def lxmlScrapingExamples(myinurl, myoutfilepath):
    myinurl = 'http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm' # what gets called
    print myinurl
    print myoutfilepath

    #Example 1 redo for myinurl new value 
    print "\n\nExample 1 - basic parsing of url"
    htmltree = lxml.html.parse(myinurl)
    # print "lxml.etree.tostring(htmltree, pretty_print=True) = %s"%(lxml.etree.tostring(htmltree, pretty_print=True))

    #Example 5 - xpath tag with class=value     N.B. backslashes for newLines, etc., DISAPPEAR in WordPress Marldown
    # see http://lxml.de/xpathxslt.html
    print "\n\nExample 5 - xpath tag with class=value"
    print """htmltree.xpath("//h1[@class='title topictitle1']")[0].text = %s"""%(htmltree.xpath("//h1[@class='title topictitle1']")[0].text)
    print """htmltree.xpath("//p[@class='shortdesc']")[0].text = %s"""%(htmltree.xpath("//p[@class='shortdesc']")[0].text)     
    print """len(htmltree.xpath("//var[@class='keyword varname']")) = %s"""%(len(htmltree.xpath("//var[@class='keyword varname']")))
    print """htmltree.xpath("//var[@class='keyword varname']")[0].text = %s"""%(htmltree.xpath("//var[@class='keyword varname']")[0].text)

    #Example 6 - parent   and   ElementVariables with   //  VS  .//   
    print "\n\nExample 6 - parent and ElementVariables"
    print """syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent())
    syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent()
    print """syntax_div = %s"""%(syntax_div)
    print syntax_div_2string,'\n'
    print """syntax_div = %s"""%(syntax_div)  

    print "\n\nsyntax_div.xpath     //   VS  .//  \n\n"   
    print "// uses  htmltree"
    print """    syntax_div.xpath("count(//samp)") = %s"""%(syntax_div.xpath("count(//samp)"))  
    print '            equals\n'
    print """    htmltree.xpath("count(//samp)") = %s"""%(htmltree.xpath("count(//samp)"))  
    print """    syntax_div.xpath("count(//var)") = %s"""%(syntax_div.xpath("count(//var)"))  
    print '            equals'
    print """    htmltree.xpath("count(//var)") = %s"""%(htmltree.xpath("count(//var)"))  
    print '\nVS   .// uses  syntax_div ONLY'
    print """    htmltree.xpath("count(.//samp)") = %s"""%(htmltree.xpath("count(.//samp)"))  
    print """    syntax_div.xpath("count(.//var)") = %s"""%(syntax_div.xpath("count(.//var)")) 
    print "\n"    
    print syntax_div_ipython_discovery

    #Example 7 - xpath select element by text
    print "\n\nExample 7 - xpath select element by text"
    print """description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent())
    description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent()
    print """description_div = %s"""%(description_div)
    print description_div_2string,'\n'
    print """description_div.xpath("./p")[0].text = %s"""%(description_div.xpath("./p")[0].text)


    #Example 8 - get all text in an element
    print "\n\nExample 8 - get all text in element\nsee http://lxml.de/lxmlhtml.html#html-element-methods"
    print """xample_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent())    
    example_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent()
    print example_div_2string,'\n'
    print "example_div.text_content() = %s"%(example_div.text_content())


    #Example 9 - zip/dict   data terms & data definitions
    print "\n\nExample 9 - zipping data terms & data definitions"
    print """options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent())    
    options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent()
    print options_div_2string
    terms = [t.text for t in options_div.xpath("dl/dt/samp/var")]
    defs  = [d.text for d in options_div.xpath("dl/dd")]
    term_def_dict = dict(zip(terms, defs))
    print '\nterm_def_dict'
    for k, v in term_def_dict.iteritems():
        print '    %s: %s'%(k,v)


#print lxml.etree.tostring(options_div, pretty_print=True)

# Pretty-printed markup of the "Options" section <div>, printed verbatim by
# Example 9 in lxmlScrapingExamples. This is a static string literal, not
# regenerated from the parsed tree at run time.
# NOTE(review): presumably captured once with
# lxml.etree.tostring(options_div, pretty_print=True) -- confirm.
options_div_2string = """<div class="section">
  <h2 class="title sectiontitle">Options</h2>
  <dl class="dl">
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">acl_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the ACL policy that is applied to the named object.
      The ACL policy must exist, or an error is displayed. 
      <p class="p">Examples of
      the ACL names are 
        <samp class="ph codeph">default-root</samp>, 
        <samp class="ph codeph">test</samp>, 
        <samp class="ph codeph">default-management</samp>,
        and 
        <samp class="ph codeph">pubs_acl3</samp>.</p>
    </dd>
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">object_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the object to which to apply the named ACL policy. The
    object name must exist, or an error is displayed. 
      <p class="p">Examples of object
      names are:
      </p>
      <ul class="ul">
        <li class="li">
          <samp class="ph codeph">/Management/Groups/Travel</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/WebSEAL</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/Management</samp>
        </li>
      </ul>
    </dd>
  </dl>
</div>"""



# Pretty-printed markup of the "Example" section <div>, printed verbatim by
# Example 8 in lxmlScrapingExamples just before the text_content() result.
# Unlike the other *_2string snapshots, this literal ends with a newline.
example_div_2string = """<div class="example">
  <h2 class="title sectiontitle">Example</h2>
  <div class="p">The following example attaches the ACL policy, 
    <samp class="ph codeph">pubs_acl3</samp>, 
    to the protected object, 
    <samp class="ph codeph">/Management</samp>: 
    <pre class="pre codeblock">
      <code>pdadmin sec_master> acl attach /Management pubs_acl3</code>
    </pre>
  </div>
</div>
"""    

# Markup snapshot printed verbatim by Example 7 in lxmlScrapingExamples, next
# to description_div.
# NOTE(review): the markup below is headed "Syntax", while description_div is
# selected via an <h2> whose text is 'Description' -- this looks like a stale
# paste of the Syntax section; confirm and replace with the Description markup.
description_div_2string = """<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span>
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var></samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div>"""

# Pretty-printed markup of the "Syntax" section <div>, printed verbatim by
# Example 6 in lxmlScrapingExamples alongside syntax_div. Static string
# literal; it is not regenerated from the parsed tree at run time.
syntax_div_2string = """<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span> 
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var>
    </samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div>"""

# Pasted IPython tab-completion listing for `syntax_div.`, printed verbatim at
# the end of Example 6 in lxmlScrapingExamples to show the attributes and
# methods available on the element.
# NOTE(review): "make_links_absolut" looks truncated in the captured listing
# (presumably make_links_absolute) -- confirm before correcting the literal.
syntax_div_ipython_discovery = """In [54]: syntax_div.
syntax_div.addnext             syntax_div.get_element_by_id   syntax_div.keys
syntax_div.addprevious         syntax_div.getchildren         syntax_div.label
syntax_div.append              syntax_div.getiterator         syntax_div.make_links_absolut
syntax_div.attrib              syntax_div.getnext             syntax_div.makeelement
syntax_div.base                syntax_div.getparent           syntax_div.nsmap
syntax_div.base_url            syntax_div.getprevious         syntax_div.prefix
syntax_div.body                syntax_div.getroottree         syntax_div.remove
syntax_div.clear               syntax_div.head                syntax_div.replace
syntax_div.cssselect           syntax_div.index               syntax_div.resolve_base_href
syntax_div.drop_tag            syntax_div.insert              syntax_div.rewrite_links
syntax_div.drop_tree           syntax_div.items               syntax_div.set
syntax_div.extend              syntax_div.iter                syntax_div.sourceline
syntax_div.find                syntax_div.iterancestors       syntax_div.tag
syntax_div.find_class          syntax_div.iterchildren        syntax_div.tail
syntax_div.find_rel_links      syntax_div.iterdescendants     syntax_div.text
syntax_div.findall             syntax_div.iterfind            syntax_div.text_content
syntax_div.findtext            syntax_div.iterlinks           syntax_div.values
syntax_div.forms               syntax_div.itersiblings        syntax_div.xpath
syntax_div.get                 syntax_div.itertext
"""

NUM_ARGS = 2
def main():
    args = sys.argv[1:]
    if len(args) != NUM_ARGS or "-h" in args or "--help" in args:
        print __doc__
        s = raw_input('hit return to quit')
        sys.exit(2)
    lxmlScrapingExamples(args[0], args[1])

if __name__ == '__main__':
    main()

Resulting Output

>lxmlScrapingExamplesMore.py http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm lxmlScrapingOutput.txt
http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm
lxmlScrapingOutput.txt


Example 1 - basic parsing of url


Example 5 - xpath tag with class=value
htmltree.xpath("//h1[@class='title topictitle1']")[0].text = acl attach
htmltree.xpath("//p[@class='shortdesc']")[0].text = Attaches an ACL policy to a protected object. If the protected
object already has an ACL attached, the ACL is replaced with a new
one. 
len(htmltree.xpath("//var[@class='keyword varname']")) = 4
htmltree.xpath("//var[@class='keyword varname']")[0].text = object_name


Example 6 - parent and ElementVariables
syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent() = <Element div at 0xb7df00>
syntax_div = <Element div at 0xb7df00>
<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span> 
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var>
    </samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div> 

syntax_div = <Element div at 0xb7df00>


syntax_div.xpath     //   VS  .//  


// uses  htmltree
    syntax_div.xpath("count(//samp)") = 14.0
            equals

    htmltree.xpath("count(//samp)") = 14.0
    syntax_div.xpath("count(//var)") = 4.0
            equals
    htmltree.xpath("count(//var)") = 4.0

VS   .// uses  syntax_div ONLY
    htmltree.xpath("count(.//samp)") = 14.0
    syntax_div.xpath("count(.//var)") = 2.0


In [54]: syntax_div.
syntax_div.addnext             syntax_div.get_element_by_id   syntax_div.keys
syntax_div.addprevious         syntax_div.getchildren         syntax_div.label
syntax_div.append              syntax_div.getiterator         syntax_div.make_links_absolut
syntax_div.attrib              syntax_div.getnext             syntax_div.makeelement
syntax_div.base                syntax_div.getparent           syntax_div.nsmap
syntax_div.base_url            syntax_div.getprevious         syntax_div.prefix
syntax_div.body                syntax_div.getroottree         syntax_div.remove
syntax_div.clear               syntax_div.head                syntax_div.replace
syntax_div.cssselect           syntax_div.index               syntax_div.resolve_base_href
syntax_div.drop_tag            syntax_div.insert              syntax_div.rewrite_links
syntax_div.drop_tree           syntax_div.items               syntax_div.set
syntax_div.extend              syntax_div.iter                syntax_div.sourceline
syntax_div.find                syntax_div.iterancestors       syntax_div.tag
syntax_div.find_class          syntax_div.iterchildren        syntax_div.tail
syntax_div.find_rel_links      syntax_div.iterdescendants     syntax_div.text
syntax_div.findall             syntax_div.iterfind            syntax_div.text_content
syntax_div.findtext            syntax_div.iterlinks           syntax_div.values
syntax_div.forms               syntax_div.itersiblings        syntax_div.xpath
syntax_div.get                 syntax_div.itertext



Example 7 - xpath select element by text
description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent() = <Element div at 0xd10e40>
description_div = <Element div at 0xd10e40>
<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span>
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var></samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div> 

description_div.xpath("./p")[0].text = At most, one ACL can be attached
to a given protected object. The same ACL can be attached to multiple
protected objects. Ensure that you are familiar with ACL management before you
use this function.


Example 8 - get all text in element
see http://lxml.de/lxmlhtml.html#html-element-methods
xample_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent() = <Element div at 0xd10e70>
<div class="example">
  <h2 class="title sectiontitle">Example</h2>
  <div class="p">The following example attaches the ACL policy, 
    <samp class="ph codeph">pubs_acl3</samp>, 
    to the protected object, 
    <samp class="ph codeph">/Management</samp>: 
    <pre class="pre codeblock">
      <code>pdadmin sec_master> acl attach /Management pubs_acl3</code>
    </pre>
  </div>
</div>


example_div.text_content() = ExampleThe following example attaches the
ACL policy, pubs_acl3, to the protected object, /Management: pdadmin sec_master> acl attach /Management pubs_acl3




Example 9 - zipping data terms & data definitions
options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent() = <Element div at 0xd10d80>
<div class="section">
  <h2 class="title sectiontitle">Options</h2>
  <dl class="dl">
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">acl_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the ACL policy that is applied to the named object.
      The ACL policy must exist, or an error is displayed. 
      <p class="p">Examples of
      the ACL names are 
        <samp class="ph codeph">default-root</samp>, 
        <samp class="ph codeph">test</samp>, 
        <samp class="ph codeph">default-management</samp>,
        and 
        <samp class="ph codeph">pubs_acl3</samp>.</p>
    </dd>
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">object_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the object to which to apply the named ACL policy. The
    object name must exist, or an error is displayed. 
      <p class="p">Examples of object
      names are:
      </p>
      <ul class="ul">
        <li class="li">
          <samp class="ph codeph">/Management/Groups/Travel</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/WebSEAL</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/Management</samp>
        </li>
      </ul>
    </dd>
  </dl>
</div>

term_def_dict
    object_name: Specifies the object to which to apply the named ACL policy. The
object name must exist, or an error is displayed. 
    acl_name: Specifies the ACL policy that is applied to the named object.
The ACL policy must exist, or an error is displayed. 

>

#lxml, #screen-scraping