More lxml Syntax Examples

More lxml Syntax Examples

Continued from lxml HTML Scraping Syntax Examples

Content:

  • Python Code
  • Resulting Output

Python Code

#!/usr/local/bin/python2.7
# -*- coding: UTF-8 -*-
"""lxmlScrapingExamplesMore.py takes INURL [URL to an html file] Producing OUTFILEPATH [a scrapped text file]
Usage:   lxmlScrapingExamplesMore.py INURL                                                  OUTFILEPATH
Example: lxmlScrapingExamplesMore.py http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm lxmlScrapingOutput.txt
"""
import sys,os

# joe professional opinion: package structure a bit goofy!   🙂
import lxml, lxml.html


def lxmlScrapingExamples(myinurl, myoutfilepath):
    myinurl = 'http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm' # what gets called
    print myinurl
    print myoutfilepath

    #Example 1 redo for myinurl new value 
    print "\n\nExample 1 - basic parsing of url"
    htmltree = lxml.html.parse(myinurl)
    # print "lxml.etree.tostring(htmltree, pretty_print=True) = %s"%(lxml.etree.tostring(htmltree, pretty_print=True))

    #Example 5 - xpath tag with class=value     N.B. backslashes for newLines, etc., DISAPPEAR in WordPress Marldown
    # see http://lxml.de/xpathxslt.html
    print "\n\nExample 5 - xpath tag with class=value"
    print """htmltree.xpath("//h1[@class='title topictitle1']")[0].text = %s"""%(htmltree.xpath("//h1[@class='title topictitle1']")[0].text)
    print """htmltree.xpath("//p[@class='shortdesc']")[0].text = %s"""%(htmltree.xpath("//p[@class='shortdesc']")[0].text)     
    print """len(htmltree.xpath("//var[@class='keyword varname']")) = %s"""%(len(htmltree.xpath("//var[@class='keyword varname']")))
    print """htmltree.xpath("//var[@class='keyword varname']")[0].text = %s"""%(htmltree.xpath("//var[@class='keyword varname']")[0].text)

    #Example 6 - parent   and   ElementVariables with   //  VS  .//   
    print "\n\nExample 6 - parent and ElementVariables"
    print """syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent())
    syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent()
    print """syntax_div = %s"""%(syntax_div)
    print syntax_div_2string,'\n'
    print """syntax_div = %s"""%(syntax_div)  

    print "\n\nsyntax_div.xpath     //   VS  .//  \n\n"   
    print "// uses  htmltree"
    print """    syntax_div.xpath("count(//samp)") = %s"""%(syntax_div.xpath("count(//samp)"))  
    print '            equals\n'
    print """    htmltree.xpath("count(//samp)") = %s"""%(htmltree.xpath("count(//samp)"))  
    print """    syntax_div.xpath("count(//var)") = %s"""%(syntax_div.xpath("count(//var)"))  
    print '            equals'
    print """    htmltree.xpath("count(//var)") = %s"""%(htmltree.xpath("count(//var)"))  
    print '\nVS   .// uses  syntax_div ONLY'
    print """    htmltree.xpath("count(.//samp)") = %s"""%(htmltree.xpath("count(.//samp)"))  
    print """    syntax_div.xpath("count(.//var)") = %s"""%(syntax_div.xpath("count(.//var)")) 
    print "\n"    
    print syntax_div_ipython_discovery

    #Example 7 - xpath select element by text
    print "\n\nExample 7 - xpath select element by text"
    print """description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent())
    description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent()
    print """description_div = %s"""%(description_div)
    print description_div_2string,'\n'
    print """description_div.xpath("./p")[0].text = %s"""%(description_div.xpath("./p")[0].text)


    #Example 8 - get all text in an element
    print "\n\nExample 8 - get all text in element\nsee http://lxml.de/lxmlhtml.html#html-element-methods"
    print """xample_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent())    
    example_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent()
    print example_div_2string,'\n'
    print "example_div.text_content() = %s"%(example_div.text_content())


    #Example 9 - zip/dict   data terms & data definitions
    print "\n\nExample 9 - zipping data terms & data definitions"
    print """options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent())    
    options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent()
    print options_div_2string
    terms = [t.text for t in options_div.xpath("dl/dt/samp/var")]
    defs  = [d.text for d in options_div.xpath("dl/dd")]
    term_def_dict = dict(zip(terms, defs))
    print '\nterm_def_dict'
    for k, v in term_def_dict.iteritems():
        print '    %s: %s'%(k,v)


# Reference copy of the "Options" section markup.  Example 9 in
# lxmlScrapingExamples prints it so the reader can compare the markup with the
# extracted term/definition pairs.
# Presumably captured with the call below -- verify before regenerating:
#print lxml.etree.tostring(options_div, pretty_print=True)

options_div_2string = """<div class="section">
  <h2 class="title sectiontitle">Options</h2>
  <dl class="dl">
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">acl_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the ACL policy that is applied to the named object.
      The ACL policy must exist, or an error is displayed. 
      <p class="p">Examples of
      the ACL names are 
        <samp class="ph codeph">default-root</samp>, 
        <samp class="ph codeph">test</samp>, 
        <samp class="ph codeph">default-management</samp>,
        and 
        <samp class="ph codeph">pubs_acl3</samp>.</p>
    </dd>
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">object_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the object to which to apply the named ACL policy. The
    object name must exist, or an error is displayed. 
      <p class="p">Examples of object
      names are:
      </p>
      <ul class="ul">
        <li class="li">
          <samp class="ph codeph">/Management/Groups/Travel</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/WebSEAL</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/Management</samp>
        </li>
      </ul>
    </dd>
  </dl>
</div>"""



# Reference copy of the "Example" section markup.  Example 8 in
# lxmlScrapingExamples prints it just before example_div.text_content(), so
# the flattened text can be compared with the tagged source.
example_div_2string = """<div class="example">
  <h2 class="title sectiontitle">Example</h2>
  <div class="p">The following example attaches the ACL policy, 
    <samp class="ph codeph">pubs_acl3</samp>, 
    to the protected object, 
    <samp class="ph codeph">/Management</samp>: 
    <pre class="pre codeblock">
      <code>pdadmin sec_master> acl attach /Management pubs_acl3</code>
    </pre>
  </div>
</div>
"""    

# Markup printed by Example 7 in lxmlScrapingExamples as the reference copy
# for description_div.
# NOTE(review): the markup below is headed "Syntax" and nearly duplicates
# syntax_div_2string, while Example 7 selects the h2 whose text is
# 'Description'.  This looks like a copy-paste leftover -- confirm against the
# page and replace it with the Description section's markup.
description_div_2string = """<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span>
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var></samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div>"""

# Reference copy of the "Syntax" section markup.  Example 6 in
# lxmlScrapingExamples prints it right after showing the syntax_div element.
syntax_div_2string = """<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span> 
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var>
    </samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div>"""

# Pasted IPython session text listing the attribute names offered on
# syntax_div; Example 6 in lxmlScrapingExamples prints it verbatim.
# NOTE(review): "make_links_absolut" appears cut off (presumably
# make_links_absolute, clipped by the terminal width) -- confirm before fixing.
syntax_div_ipython_discovery = """In [54]: syntax_div.
syntax_div.addnext             syntax_div.get_element_by_id   syntax_div.keys
syntax_div.addprevious         syntax_div.getchildren         syntax_div.label
syntax_div.append              syntax_div.getiterator         syntax_div.make_links_absolut
syntax_div.attrib              syntax_div.getnext             syntax_div.makeelement
syntax_div.base                syntax_div.getparent           syntax_div.nsmap
syntax_div.base_url            syntax_div.getprevious         syntax_div.prefix
syntax_div.body                syntax_div.getroottree         syntax_div.remove
syntax_div.clear               syntax_div.head                syntax_div.replace
syntax_div.cssselect           syntax_div.index               syntax_div.resolve_base_href
syntax_div.drop_tag            syntax_div.insert              syntax_div.rewrite_links
syntax_div.drop_tree           syntax_div.items               syntax_div.set
syntax_div.extend              syntax_div.iter                syntax_div.sourceline
syntax_div.find                syntax_div.iterancestors       syntax_div.tag
syntax_div.find_class          syntax_div.iterchildren        syntax_div.tail
syntax_div.find_rel_links      syntax_div.iterdescendants     syntax_div.text
syntax_div.findall             syntax_div.iterfind            syntax_div.text_content
syntax_div.findtext            syntax_div.iterlinks           syntax_div.values
syntax_div.forms               syntax_div.itersiblings        syntax_div.xpath
syntax_div.get                 syntax_div.itertext
"""

NUM_ARGS = 2
def main():
    args = sys.argv[1:]
    if len(args) != NUM_ARGS or "-h" in args or "--help" in args:
        print __doc__
        s = raw_input('hit return to quit')
        sys.exit(2)
    lxmlScrapingExamples(args[0], args[1])

if __name__ == '__main__':
    main()

Resulting Output

>lxmlScrapingExamplesMore.py http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm lxmlScrapingOutput.txt
http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm
lxmlScrapingOutput.txt


Example 1 - basic parsing of url


Example 5 - xpath tag with class=value
htmltree.xpath("//h1[@class='title topictitle1']")[0].text = acl attach
htmltree.xpath("//p[@class='shortdesc']")[0].text = Attaches an ACL policy to a protected object. If the protected
object already has an ACL attached, the ACL is replaced with a new
one. 
len(htmltree.xpath("//var[@class='keyword varname']")) = 4
htmltree.xpath("//var[@class='keyword varname']")[0].text = object_name


Example 6 - parent and ElementVariables
syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent() = <Element div at 0xb7df00>
syntax_div = <Element div at 0xb7df00>
<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span> 
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var>
    </samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div> 

syntax_div = <Element div at 0xb7df00>


syntax_div.xpath     //   VS  .//  


// uses  htmltree
    syntax_div.xpath("count(//samp)") = 14.0
            equals

    htmltree.xpath("count(//samp)") = 14.0
    syntax_div.xpath("count(//var)") = 4.0
            equals
    htmltree.xpath("count(//var)") = 4.0

VS   .// uses  syntax_div ONLY
    htmltree.xpath("count(.//samp)") = 14.0
    syntax_div.xpath("count(.//var)") = 2.0


In [54]: syntax_div.
syntax_div.addnext             syntax_div.get_element_by_id   syntax_div.keys
syntax_div.addprevious         syntax_div.getchildren         syntax_div.label
syntax_div.append              syntax_div.getiterator         syntax_div.make_links_absolut
syntax_div.attrib              syntax_div.getnext             syntax_div.makeelement
syntax_div.base                syntax_div.getparent           syntax_div.nsmap
syntax_div.base_url            syntax_div.getprevious         syntax_div.prefix
syntax_div.body                syntax_div.getroottree         syntax_div.remove
syntax_div.clear               syntax_div.head                syntax_div.replace
syntax_div.cssselect           syntax_div.index               syntax_div.resolve_base_href
syntax_div.drop_tag            syntax_div.insert              syntax_div.rewrite_links
syntax_div.drop_tree           syntax_div.items               syntax_div.set
syntax_div.extend              syntax_div.iter                syntax_div.sourceline
syntax_div.find                syntax_div.iterancestors       syntax_div.tag
syntax_div.find_class          syntax_div.iterchildren        syntax_div.tail
syntax_div.find_rel_links      syntax_div.iterdescendants     syntax_div.text
syntax_div.findall             syntax_div.iterfind            syntax_div.text_content
syntax_div.findtext            syntax_div.iterlinks           syntax_div.values
syntax_div.forms               syntax_div.itersiblings        syntax_div.xpath
syntax_div.get                 syntax_div.itertext



Example 7 - xpath select element by text
description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent() = <Element div at 0xd10e40>
description_div = <Element div at 0xd10e40>
<div class="section">
  <h2 class="title sectiontitle">Syntax</h2>
  <p class="p">
    <span class="keyword cmdname">acl attach</span>
    <samp class="ph codeph">
      <var class="keyword varname">object_name</var></samp> 
    <samp class="ph codeph">
      <var class="keyword varname">acl_name</var>
    </samp>
  </p>
 </div> 

description_div.xpath("./p")[0].text = At most, one ACL can be attached
to a given protected object. The same ACL can be attached to multiple
protected objects. Ensure that you are familiar with ACL management before you
use this function.


Example 8 - get all text in element
see http://lxml.de/lxmlhtml.html#html-element-methods
xample_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent() = <Element div at 0xd10e70>
<div class="example">
  <h2 class="title sectiontitle">Example</h2>
  <div class="p">The following example attaches the ACL policy, 
    <samp class="ph codeph">pubs_acl3</samp>, 
    to the protected object, 
    <samp class="ph codeph">/Management</samp>: 
    <pre class="pre codeblock">
      <code>pdadmin sec_master> acl attach /Management pubs_acl3</code>
    </pre>
  </div>
</div>


example_div.text_content() = ExampleThe following example attaches the
ACL policy, pubs_acl3, to the protected object, /Management: pdadmin sec_master> acl attach /Management pubs_acl3




Example 9 - zipping data terms & data definitions
options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent() = <Element div at 0xd10d80>
<div class="section">
  <h2 class="title sectiontitle">Options</h2>
  <dl class="dl">
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">acl_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the ACL policy that is applied to the named object.
      The ACL policy must exist, or an error is displayed. 
      <p class="p">Examples of
      the ACL names are 
        <samp class="ph codeph">default-root</samp>, 
        <samp class="ph codeph">test</samp>, 
        <samp class="ph codeph">default-management</samp>,
        and 
        <samp class="ph codeph">pubs_acl3</samp>.</p>
    </dd>
    <dt class="dt dlterm">
      <samp class="ph codeph">
        <var class="keyword varname">object_name</var>
      </samp>
    </dt>
    <dd class="dd">Specifies the object to which to apply the named ACL policy. The
    object name must exist, or an error is displayed. 
      <p class="p">Examples of object
      names are:
      </p>
      <ul class="ul">
        <li class="li">
          <samp class="ph codeph">/Management/Groups/Travel</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/WebSEAL</samp>
        </li>
        <li class="li">
          <samp class="ph codeph">/Management</samp>
        </li>
      </ul>
    </dd>
  </dl>
</div>

term_def_dict
    object_name: Specifies the object to which to apply the named ACL policy. The
object name must exist, or an error is displayed. 
    acl_name: Specifies the ACL policy that is applied to the named object.
The ACL policy must exist, or an error is displayed. 

>
Advertisements

lxml HTML Scraping Syntax Examples

lxml Syntax Examples

Content:

  • Python Code
  • Resulting Output

Python Code

#!/usr/local/bin/python2.7
# -*- coding: UTF-8 -*-
"""lxmlScrapingExamples.py takes INURL [URL to an html file] Producing OUTFILEPATH [a scrapped text file]
Usage:   lxmlScrapingExamples.py INURL                                                  OUTFILEPATH
Example: lxmlScrapingExamples.py http://joecodeswell.org/examples/dlwebfiles/htmlExample.html lxmlScrapingOutput.txt
"""
import sys

# joe professional opinion: package structure a bit goofy!   🙂
import lxml, lxml.html


def lxmlScrapingExamples(myinurl, myoutfilepath):
    print myinurl
    print myoutfilepath

    #Example 1 - basic parsing of url - slightly altered from: http://stackoverflow.com/a/14303564/601770
    print "\n\nExample 1 - basic parsing of url"
    htmltree = lxml.html.parse(myinurl)
    print "lxml.etree.tostring(htmltree, pretty_print=True) = %s"%(lxml.etree.tostring(htmltree, pretty_print=True))



    #Example 2 - syntax examples [css_selector, xpath] - slightly altered from: http://stackoverflow.com/a/603630/601770
    print "\n\nExample 2 - syntax examples [css_selector, xpath]"
    # joe comment - i don't know why htmltree DOESN'T WORK DIRECTLY in this example it generates error:
    #     more lxml package/module/class/function assymetry?
    '''
    File "C:\1d\PythonPjs\kivyPjs\IBMsecurityAPIclientsPj\IBMsecurityAPIclient\ngExamples.py", line 28, in lxmlScrapingExamples
        for a in mySearchTree.cssselect('tr a'):
    AttributeError: 'lxml.etree._ElementTree' object has no attribute 'cssselect'    
    '''
    #mySearchTree = htmltree
    mySearchTree = lxml.html.fromstring(lxml.etree.tostring(htmltree))         
    # Find all 'a' elements inside 'tr' table rows with css selector
    print "Find all 'a' elements inside 'tr' table rows with css selector"
    for itm in mySearchTree.cssselect('tr a'):
        print 'found "%s" link to href "%s"' % (itm.text, itm.get('href'))    
    # Find all 'a' elements inside 'tr' table rows with xpath
    print "Find all 'a' elements inside 'tr' table rows with xpath"
    for itm in mySearchTree.xpath('.//tr/*/a'):
        print 'found "%s" link to href "%s"' % (itm.text, itm.get('href'))

    #Example 3 - syntax examples [xpath, .findall(), .getchildren()] - slightly altered from: http://stackoverflow.com/a/9920703/601770
    print "\n\nExample 3 - syntax examples [xpath, .findall(), .getchildren()] "
    page = htmltree
    rows = page.xpath("body/table")[1].findall("tr")   # table [1] is the 2nd table in MY example html
    data = list()
    for row in rows:
        data.append([c.text for c in row.getchildren()])
    for itm in data[4:]: print(itm)

    #Example 4 - following sibling [] - slightly altered from: http://stackoverflow.com/questions/3139402/how-to-select-following-sibling-xml-tag-using-xpath
    print "\n\nExample 4 - following sibling []"
    sibEx = '''
    <html>
    <head>
    <title>following sibling</title>
    </head>
    <body>
    <table border>    
    <tr>
        <td class="name">Brand</td>
        <td class="desc">Intel</td>
    </tr>
    <tr>
        <td class="name">Series</td>
        <td class="desc">Core i5</td>
    </tr>
    <tr>
        <td class="name">Cores</td>
        <td class="desc">4</td>
    </tr>
    <tr>
        <td class="name">Socket</td>
        <td class="desc">LGA 1156</td>    
    </tr>

    <tr>
        <td class="name">Brand</td>
        <td class="desc">AMD</td>
    </tr>
    <tr>
        <td class="name">Series</td>
        <td class="desc">Phenom II X4</td>
    </tr>
    <tr>
        <td class="name">Cores</td>
        <td class="desc">4</td>
    </tr>
    <tr>
        <td class="name">Socket</td>
        <td class="desc">Socket AM3</td>
    </tr>
    </table>
    </body>
    </html>    
    '''
    parsedDocument = lxml.html.fromstring(sibEx)

    # bad
    #rlist = parsedDocument.xpath("tr[td[@class='name'] ='Brand']")
    #rlist = parsedDocument.xpath("tr[td[@class='name'] ='Brand']/td[@class='desc']")
    #r = parsedDocument.xpath(tr/td[@class="name"])=='Brand')
    # r = parsedDocument.tr[td[@class='name'] ='Brand'].text
    #r = parsedDocument.tr[td[@class='name'] ='Brand']/td[@class='desc'].text
    #if(parsedDocument.xpath(tr/td[@class="name"])=='Brand'):

    # good
    #print "parsedDocument.xpath('/html/body/table/tr') = %s"%(parsedDocument.xpath('/html/body/table/tr'))
    print """parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']") = %s"""%(parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']"))
    print """parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")[0].text = %s"""%(parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")[0].text)



    print '\n\n\n'


NUM_ARGS = 2
def main():
    args = sys.argv[1:]
    if len(args) != NUM_ARGS or "-h" in args or "--help" in args:
        print __doc__
        s = raw_input('hit return to quit')
        sys.exit(2)
    lxmlScrapingExamples(args[0], args[1])

if __name__ == '__main__':
    main()

Resulting Output

>lxmlScrapingExamples.py http://joecodeswell.org/examples/dlwebfiles/htmlExample.html lxmlScrapingOutput.txt
http://joecodeswell.org/examples/dlwebfiles/htmlExample.html
lxmlScrapingOutput.txt


Example 1 - basic parsing of url
lxml.etree.tostring(htmltree, pretty_print=True) = <!DOCTYPE html>
<html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=windows-1252"/>
    <title>lxml htmlExamples.html</title>
  </head>
  <body>
    <h1>lxml htmlExamples.html for Joe Codeswell examples - dlwebfiles</h1>

    <h2>Example 1</h2>
    <ul><li><a href="http://joecodeswell.org/examples/dlwebfiles/aveverum.mid">aveverum.mid</a></li>
      <li><a href="http://joecodeswell.org/examples/dlwebfiles/carol.mid">carol.mid</a></li>
      <li><a href="http://joecodeswell.org/examples/dlwebfiles/steiner.mid">steiner.mid</a></li>
    </ul><h2>Example 2</h2>
    <table align="left" border="0" cellspacing="0" cellpadding="0" width="100%"><tr align="left" valign="top"><th>Name</th>
        <th>File Name & Link</th>
      </tr><tr align="left" valign="top"><td>Ave Verum</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/aveverum.mid">aveverum.mid</a></td></tr><tr align="left" valign="top"><td>A Carol</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid">carol.mid</a></td></tr><tr align="left" valign="top"><td>Steiner Amen?</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/steiner.mid">steiner.mid</a></td></tr></table><h2>Example 3</h2>
    <table border=""><tr align="LEFT"><th colspan="38">Main Subject</th>
    </tr><tr align="LEFT"><th colspan="2"> </th>

    <th valign="TOP" colspan="18">part1</th>
    <th valign="TOP" colspan="18">part2</th>
    </tr><tr align="LEFT"><th colspan="2"> </th>
    <th valign="TOP" colspan="9">sub-part1</th>
    <th valign="TOP" colspan="9">sub-part2</th>
    <th valign="TOP" colspan="9">sub-part3</th>
    <th valign="TOP" colspan="9">sub-part4</th>
    </tr><tr align="LEFT"><th colspan="2"> </th>
    <th valign="TOP" colspan="1">subject1</th>
    <th valign="TOP" colspan="1">subject2</th>

    <th valign="TOP" colspan="1">subject10</th>
    <th valign="TOP" colspan="1">subject11</th>
    <th valign="TOP" colspan="1">subject12</th>
    <th valign="TOP" colspan="1">subject13</th>
    <th valign="TOP" colspan="1">subject14</th>
    <th valign="TOP" colspan="1">subject15</th>
    <th valign="TOP" colspan="1">subject16</th>

    <th valign="TOP" colspan="1">subject17</th>
    <th valign="TOP" colspan="1">subject18</th>
    <th valign="TOP" colspan="1">subject19</th>
    <th valign="TOP" colspan="1">subject20</th>
    <th valign="TOP" colspan="1">subject21</th>
    <th valign="TOP" colspan="1">subject22</th>
    <th valign="TOP" colspan="1">subject23</th>
    <th valign="TOP" colspan="1">subject24</th>
    <th valign="TOP" colspan="1">subject25</th>

    <th valign="TOP" colspan="1">subject26</th>
    <th valign="TOP" colspan="1">subject27</th>
    <th valign="TOP" colspan="1">subject28</th>
    <th valign="TOP" colspan="1">subject29</th>
    <th valign="TOP" colspan="1">subject30</th>
    <th valign="TOP" colspan="1">subject31</th>
    <th valign="TOP" colspan="1">subject32</th>
    <th valign="TOP" colspan="1">subject33</th>
    <th valign="TOP" colspan="1">subject34</th>

    <th valign="TOP" colspan="1">subject35</th>
    <th valign="TOP" colspan="1">subject36</th>
    </tr><tr align="RIGHT"><th align="LEFT" valign="TOP" rowspan="12">2050</th>
    <th align="LEFT">January</th>
    <td>0</td>
    <td>1</td>
    <td>3</td>
    <td>0</td>

    <td>4</td>
    <td>16</td>
    <td>0</td>
    <td>6</td>
    <td>2</td>
    <td>2</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>

    <td>3</td>
    <td>2</td>
    <td>0</td>
    <td>26</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>

    <td>5</td>
    <td>6</td>
    <td>0</td>
    <td>8</td>
    <td>2</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">February</th>
    <td>1</td>
    <td>0</td>

    <td>8</td>
    <td>0</td>
    <td>2</td>
    <td>4</td>
    <td>1</td>
    <td>6</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>

    <td>3</td>
    <td>0</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>
    <td>25</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>

    <td>2</td>
    <td>0</td>
    <td>4</td>
    <td>14</td>
    <td>1</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">March</th>

    <td>0</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>
    <td>4</td>
    <td>7</td>
    <td>0</td>
    <td>9</td>
    <td>2</td>

    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>9</td>
    <td>0</td>
    <td>45</td>
    <td>1</td>

    <td>0</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>
    <td>10</td>
    <td>16</td>
    <td>0</td>
    <td>5</td>
    <td>1</td>

    <td>1</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>

    </tr><tr align="RIGHT"><th align="LEFT">April</th>
    <td>1</td>
    <td>0</td>
    <td>5</td>
    <td>0</td>
    <td>3</td>
    <td>12</td>
    <td>1</td>

    <td>11</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>
    <td>0</td>
    <td>3</td>
    <td>2</td>

    <td>34</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>
    <td>6</td>
    <td>18</td>
    <td>1</td>

    <td>3</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>5</td>
    <td>1</td>
    </tr><tr align="RIGHT"><th align="LEFT">May</th>
    <td>7</td>
    <td>0</td>
    <td>6</td>
    <td>0</td>
    <td>8</td>

    <td>4</td>
    <td>1</td>
    <td>13</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>2</td>
    <td>0</td>
    <td>1</td>

    <td>7</td>
    <td>1</td>
    <td>30</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>
    <td>5</td>

    <td>12</td>
    <td>0</td>
    <td>4</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>6</td>
    <td>1</td>
    </tr><tr align="RIGHT"><th align="LEFT">June</th>
    <td>0</td>
    <td>1</td>
    <td>14</td>

    <td>0</td>
    <td>7</td>
    <td>15</td>
    <td>0</td>
    <td>17</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>
    <td>5</td>

    <td>0</td>
    <td>1</td>
    <td>3</td>
    <td>0</td>
    <td>24</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>5</td>

    <td>0</td>
    <td>6</td>
    <td>13</td>
    <td>1</td>
    <td>9</td>
    <td>1</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>1</td>
    </tr><tr align="RIGHT"><th align="LEFT">July</th>
    <td>0</td>

    <td>1</td>
    <td>6</td>
    <td>0</td>
    <td>8</td>
    <td>17</td>
    <td>1</td>
    <td>15</td>
    <td>2</td>
    <td>1</td>

    <td>0</td>
    <td>10</td>
    <td>0</td>
    <td>2</td>
    <td>15</td>
    <td>2</td>
    <td>53</td>
    <td>0</td>
    <td>3</td>

    <td>3</td>
    <td>6</td>
    <td>0</td>
    <td>7</td>
    <td>16</td>
    <td>0</td>
    <td>9</td>
    <td>1</td>
    <td>1</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">August</th>
    <td>2</td>
    <td>0</td>
    <td>5</td>
    <td>0</td>
    <td>8</td>
    <td>15</td>
    <td>1</td>

    <td>17</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    <td>5</td>
    <td>16</td>
    <td>0</td>

    <td>33</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>11</td>
    <td>0</td>
    <td>2</td>
    <td>25</td>
    <td>4</td>

    <td>8</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>3</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">September</th>
    <td>2</td>
    <td>0</td>
    <td>10</td>
    <td>0</td>
    <td>16</td>

    <td>22</td>
    <td>2</td>
    <td>19</td>
    <td>4</td>
    <td>2</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>

    <td>8</td>
    <td>0</td>
    <td>27</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>8</td>
    <td>0</td>
    <td>11</td>

    <td>31</td>
    <td>1</td>
    <td>9</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>1</td>
    <td>1</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">October</th>
    <td>3</td>
    <td>1</td>
    <td>8</td>

    <td>0</td>
    <td>4</td>
    <td>28</td>
    <td>0</td>
    <td>15</td>
    <td>2</td>
    <td>1</td>
    <td>0</td>
    <td>1</td>

    <td>0</td>
    <td>1</td>
    <td>6</td>
    <td>0</td>
    <td>15</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>3</td>

    <td>0</td>
    <td>9</td>
    <td>26</td>
    <td>1</td>
    <td>8</td>
    <td>4</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">November</th>
    <td>0</td>

    <td>3</td>
    <td>3</td>
    <td>0</td>
    <td>6</td>
    <td>23</td>
    <td>1</td>
    <td>8</td>
    <td>1</td>
    <td>2</td>

    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>3</td>
    <td>7</td>
    <td>1</td>
    <td>20</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>8</td>
    <td>0</td>
    <td>3</td>
    <td>18</td>
    <td>3</td>
    <td>7</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">December</th>
    <td>1</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>
    <td>4</td>
    <td>13</td>
    <td>2</td>

    <td>15</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>

    <td>29</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>
    <td>3</td>
    <td>20</td>
    <td>1</td>

    <td>13</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>3</td>
    <td>0</td>
    </tr></table></body>
</html>



Example 2 - syntax examples [css_selector, xpath]
Find all 'a' elements inside 'tr' table rows with css selector
found "aveverum.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/aveverum.mid"
found "carol.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid"
found "steiner.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/steiner.mid"
Find all 'a' elements inside 'tr' table rows with xpath
found "aveverum.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/aveverum.mid"
found "carol.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid"
found "steiner.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/steiner.mid"


Example 3 - syntax examples [xpath, .findall(), .getchildren()] 
['2050', 'January', '0', '1', '3', '0', '4', '16', '0', '6', '2', '2', '0', '3', '0', '3', '2', '0', '26', '1', '0', '0', '7', '0', '5', '6', '0', '8', '2', '0', '0', '0', '0', '0', '0', '0', '2', '0']
['February', '1', '0', '8', '0', '2', '4', '1', '6', '1', '2', '0', '3', '0', '0', '4', '0', '25', '0', '0', '1', '2', '0', '4', '14', '1', '1', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0']
['March', '0', '0', '4', '0', '4', '7', '0', '9', '2', '1', '0', '0', '0', '2', '9', '0', '45', '1', '0', '0', '7', '0', '10', '16', '0', '5', '1', '1', '0', '1', '0', '0', '0', '0', '4', '0']
['April', '1', '0', '5', '0', '3', '12', '1', '11', '0', '3', '0', '3', '0', '0', '3', '2', '34', '0', '0', '1', '2', '0', '6', '18', '1', '3', '0', '0', '0', '0', '0', '0', '0', '0', '5', '1']
['May', '7', '0', '6', '0', '8', '4', '1', '13', '0', '0', '2', '2', '0', '1', '7', '1', '30', '0', '0', '0', '7', '0', '5', '12', '0', '4', '1', '0', '0', '0', '0', '0', '0', '0', '6', '1']
['June', '0', '1', '14', '0', '7', '15', '0', '17', '1', '2', '0', '5', '0', '1', '3', '0', '24', '0', '0', '0', '5', '0', '6', '13', '1', '9', '1', '1', '0', '0', '0', '0', '0', '0', '2', '1']
['July', '0', '1', '6', '0', '8', '17', '1', '15', '2', '1', '0', '10', '0', '2', '15', '2', '53', '0', '3', '3', '6', '0', '7', '16', '0', '9', '1', '1', '0', '0', '0', '0', '1', '0', '2', '0']
['August', '2', '0', '5', '0', '8', '15', '1', '17', '0', '2', '0', '2', '0', '5', '16', '0', '33', '0', '0', '0', '11', '0', '2', '25', '4', '8', '0', '0', '0', '1', '0', '0', '0', '0', '3', '0']
['September', '2', '0', '10', '0', '16', '22', '2', '19', '4', '2', '0', '0', '0', '2', '8', '0', '27', '0', '1', '0', '8', '0', '11', '31', '1', '9', '0', '0', '0', '1', '0', '0', '0', '1', '1', '0']
['October', '3', '1', '8', '0', '4', '28', '0', '15', '2', '1', '0', '1', '0', '1', '6', '0', '15', '0', '1', '0', '3', '0', '9', '26', '1', '8', '4', '0', '0', '0', '0', '0', '0', '0', '1', '0']
['November', '0', '3', '3', '0', '6', '23', '1', '8', '1', '2', '0', '1', '0', '3', '7', '1', '20', '0', '0', '0', '8', '0', '3', '18', '3', '7', '0', '0', '0', '0', '0', '0', '0', '0', '3', '0']
['December', '1', '0', '4', '0', '4', '13', '2', '15', '1', '0', '0', '2', '0', '1', '2', '0', '29', '0', '1', '0', '7', '0', '3', '20', '1', '13', '0', '1', '0', '0', '0', '0', '0', '0', '3', '0']


Example 4 - following sibling []
parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']") = [<Element td at 0xda53c0>, <Element td at 0xda5390>]
parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")[0].text = Intel

>