lxml.html basic parse methods for [url, file, string]

lxml.html examples of parsing from

  • URLs
  • Files
  • Strings

URLs

import lxml.html
# parse() is given a URL here, so the document is read straight from the web.
htmltree = lxml.html.parse('http://joecodeswell.com')

# Text of the first <title> element found anywhere in the document.
htmltree.xpath("//title")[0].text

'''
OUTPUT:
'JoeCodeswell.com'
'''

Files

N.B. Save ‘http://joecodeswell.com’ as a file named ‘JoeCodeswell.com.htm’.
Make sure to cd to the dir containing the file before running the following.

import lxml.html
# parse() is given a local file name here; the path is relative to the current directory.
htmltree = lxml.html.parse('JoeCodeswell.com.htm')

# Text of the first <title> element found anywhere in the document.
htmltree.xpath("//title")[0].text

'''
OUTPUT:
'JoeCodeswell.com'
'''

Strings

N.B. Save ‘http://joecodeswell.com’ as a file named ‘JoeCodeswell.com.htm’.
Make sure to cd to the dir containing the file before running the following.

import lxml.html

# Read the saved page into a string. The with-block closes the file even if
# read() raises, which the one-line open/read/close form did not guarantee.
with open('JoeCodeswell.com.htm', 'r') as f:
    the_string = f.read()

# Parse the in-memory HTML text.
htmltree = lxml.html.fromstring(the_string)

# Text of the first <title> element found anywhere in the document.
htmltree.xpath("//title")[0].text

'''
OUTPUT:
'JoeCodeswell.com'
'''

lxml HTML Scraping Syntax Examples

lxml Syntax Examples

Content:

  • Python Code
  • Resulting Output

Python Code

#!/usr/local/bin/python2.7
# -*- coding: UTF-8 -*-
"""lxmlScrapingExamples.py takes INURL [URL to an html file] Producing OUTFILEPATH [a scrapped text file]
Usage:   lxmlScrapingExamples.py INURL                                                  OUTFILEPATH
Example: lxmlScrapingExamples.py http://joecodeswell.org/examples/dlwebfiles/htmlExample.html lxmlScrapingOutput.txt
"""
import sys

# joe professional opinion: package structure a bit goofy!   🙂
import lxml, lxml.html


def lxmlScrapingExamples(myinurl, myoutfilepath):
    """Run four lxml scraping demos and print their results to stdout.

    Args:
        myinurl: URL of the HTML document handed to lxml.html.parse().
            Examples 2 and 3 rely on the table layout of the author's
            example page (links inside table rows, and a second <table>
            directly under <body>).
        myoutfilepath: Output path taken from the command line. It is only
            echoed back; nothing is written to it by this function.

    Raises:
        IndexError: Example 3 indexes the second <table> under <body>, so a
            page with fewer than two such tables fails there.
    """
    print(myinurl)
    print(myoutfilepath)

    #Example 1 - basic parsing of url - slightly altered from: http://stackoverflow.com/a/14303564/601770
    print("\n\nExample 1 - basic parsing of url")
    htmltree = lxml.html.parse(myinurl)
    print("lxml.etree.tostring(htmltree, pretty_print=True) = %s"%(lxml.etree.tostring(htmltree, pretty_print=True)))

    #Example 2 - syntax examples [css_selector, xpath] - slightly altered from: http://stackoverflow.com/a/603630/601770
    print("\n\nExample 2 - syntax examples [css_selector, xpath]")
    # parse() returns an lxml.etree._ElementTree, which has no .cssselect().
    # Using htmltree directly fails with:
    #     for a in mySearchTree.cssselect('tr a'):
    #     AttributeError: 'lxml.etree._ElementTree' object has no attribute 'cssselect'
    # The element-level API lives on the root element, so ask the tree for it
    # instead of serializing the whole document and re-parsing it.
    mySearchTree = htmltree.getroot()
    # Find all 'a' elements inside 'tr' table rows with css selector
    print("Find all 'a' elements inside 'tr' table rows with css selector")
    for itm in mySearchTree.cssselect('tr a'):
        print('found "%s" link to href "%s"' % (itm.text, itm.get('href')))
    # Find all 'a' elements inside 'tr' table rows with xpath
    print("Find all 'a' elements inside 'tr' table rows with xpath")
    for itm in mySearchTree.xpath('.//tr/*/a'):
        print('found "%s" link to href "%s"' % (itm.text, itm.get('href')))

    #Example 3 - syntax examples [xpath, .findall(), .getchildren()] - slightly altered from: http://stackoverflow.com/a/9920703/601770
    print("\n\nExample 3 - syntax examples [xpath, .findall(), .getchildren()] ")
    # table [1] is the 2nd table in MY example html
    rows = htmltree.xpath("body/table")[1].findall("tr")
    # One list of cell texts per row. getchildren() is kept because this
    # example demonstrates it; lxml documents it as deprecated in favour of
    # list(row) or iterating the element directly.
    data = [[c.text for c in row.getchildren()] for row in rows]
    # Skip the first four rows (header rows on the author's example page).
    for itm in data[4:]:
        print(itm)

    #Example 4 - following sibling [] - slightly altered from: http://stackoverflow.com/questions/3139402/how-to-select-following-sibling-xml-tag-using-xpath
    print("\n\nExample 4 - following sibling []")
    sibEx = '''
    <html>
    <head>
    <title>following sibling</title>
    </head>
    <body>
    <table border>    
    <tr>
        <td class="name">Brand</td>
        <td class="desc">Intel</td>
    </tr>
    <tr>
        <td class="name">Series</td>
        <td class="desc">Core i5</td>
    </tr>
    <tr>
        <td class="name">Cores</td>
        <td class="desc">4</td>
    </tr>
    <tr>
        <td class="name">Socket</td>
        <td class="desc">LGA 1156</td>    
    </tr>

    <tr>
        <td class="name">Brand</td>
        <td class="desc">AMD</td>
    </tr>
    <tr>
        <td class="name">Series</td>
        <td class="desc">Phenom II X4</td>
    </tr>
    <tr>
        <td class="name">Cores</td>
        <td class="desc">4</td>
    </tr>
    <tr>
        <td class="name">Socket</td>
        <td class="desc">Socket AM3</td>
    </tr>
    </table>
    </body>
    </html>    
    '''
    parsedDocument = lxml.html.fromstring(sibEx)

    # bad - attempts that do NOT work, kept for reference
    #rlist = parsedDocument.xpath("tr[td[@class='name'] ='Brand']")
    #rlist = parsedDocument.xpath("tr[td[@class='name'] ='Brand']/td[@class='desc']")
    #r = parsedDocument.xpath(tr/td[@class="name"])=='Brand')
    # r = parsedDocument.tr[td[@class='name'] ='Brand'].text
    #r = parsedDocument.tr[td[@class='name'] ='Brand']/td[@class='desc'].text
    #if(parsedDocument.xpath(tr/td[@class="name"])=='Brand'):

    # good - select the 'desc' cell of every row whose 'name' cell is 'Brand'
    #print "parsedDocument.xpath('/html/body/table/tr') = %s"%(parsedDocument.xpath('/html/body/table/tr'))
    # Evaluate the query once and reuse the result for both printouts.
    brandDescCells = parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")
    print("""parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']") = %s"""%(brandDescCells,))
    print("""parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")[0].text = %s"""%(brandDescCells[0].text))

    print('\n\n\n')


NUM_ARGS = 2  # INURL and OUTFILEPATH
def main():
    """Check the command line, then run the scraping examples.

    When the argument count is not NUM_ARGS, or -h / --help is present,
    print the module usage text, wait for Return and exit with status 2.
    Otherwise pass the two arguments to lxmlScrapingExamples().
    """
    args = sys.argv[1:]
    if len(args) != NUM_ARGS or "-h" in args or "--help" in args:
        print(__doc__)
        # Pause before exiting (presumably so the usage text stays readable);
        # the typed text itself is not needed, so it is not stored.
        raw_input('hit return to quit')
        sys.exit(2)
    lxmlScrapingExamples(args[0], args[1])

if __name__ == '__main__':
    main()

Resulting Output

>lxmlScrapingExamples.py http://joecodeswell.org/examples/dlwebfiles/htmlExample.html lxmlScrapingOutput.txt
http://joecodeswell.org/examples/dlwebfiles/htmlExample.html
lxmlScrapingOutput.txt


Example 1 - basic parsing of url
lxml.etree.tostring(htmltree, pretty_print=True) = <!DOCTYPE html>
<html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=windows-1252"/>
    <title>lxml htmlExamples.html</title>
  </head>
  <body>
    <h1>lxml htmlExamples.html for Joe Codeswell examples - dlwebfiles</h1>

    <h2>Example 1</h2>
    <ul><li><a href="http://joecodeswell.org/examples/dlwebfiles/aveverum.mid">aveverum.mid</a></li>
      <li><a href="http://joecodeswell.org/examples/dlwebfiles/carol.mid">carol.mid</a></li>
      <li><a href="http://joecodeswell.org/examples/dlwebfiles/steiner.mid">steiner.mid</a></li>
    </ul><h2>Example 2</h2>
    <table align="left" border="0" cellspacing="0" cellpadding="0" width="100%"><tr align="left" valign="top"><th>Name</th>
        <th>File Name & Link</th>
      </tr><tr align="left" valign="top"><td>Ave Verum</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/aveverum.mid">aveverum.mid</a></td></tr><tr align="left" valign="top"><td>A Carol</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid">carol.mid</a></td></tr><tr align="left" valign="top"><td>Steiner Amen?</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/steiner.mid">steiner.mid</a></td></tr></table><h2>Example 3</h2>
    <table border=""><tr align="LEFT"><th colspan="38">Main Subject</th>
    </tr><tr align="LEFT"><th colspan="2"> </th>

    <th valign="TOP" colspan="18">part1</th>
    <th valign="TOP" colspan="18">part2</th>
    </tr><tr align="LEFT"><th colspan="2"> </th>
    <th valign="TOP" colspan="9">sub-part1</th>
    <th valign="TOP" colspan="9">sub-part2</th>
    <th valign="TOP" colspan="9">sub-part3</th>
    <th valign="TOP" colspan="9">sub-part4</th>
    </tr><tr align="LEFT"><th colspan="2"> </th>
    <th valign="TOP" colspan="1">subject1</th>
    <th valign="TOP" colspan="1">subject2</th>

    <th valign="TOP" colspan="1">subject10</th>
    <th valign="TOP" colspan="1">subject11</th>
    <th valign="TOP" colspan="1">subject12</th>
    <th valign="TOP" colspan="1">subject13</th>
    <th valign="TOP" colspan="1">subject14</th>
    <th valign="TOP" colspan="1">subject15</th>
    <th valign="TOP" colspan="1">subject16</th>

    <th valign="TOP" colspan="1">subject17</th>
    <th valign="TOP" colspan="1">subject18</th>
    <th valign="TOP" colspan="1">subject19</th>
    <th valign="TOP" colspan="1">subject20</th>
    <th valign="TOP" colspan="1">subject21</th>
    <th valign="TOP" colspan="1">subject22</th>
    <th valign="TOP" colspan="1">subject23</th>
    <th valign="TOP" colspan="1">subject24</th>
    <th valign="TOP" colspan="1">subject25</th>

    <th valign="TOP" colspan="1">subject26</th>
    <th valign="TOP" colspan="1">subject27</th>
    <th valign="TOP" colspan="1">subject28</th>
    <th valign="TOP" colspan="1">subject29</th>
    <th valign="TOP" colspan="1">subject30</th>
    <th valign="TOP" colspan="1">subject31</th>
    <th valign="TOP" colspan="1">subject32</th>
    <th valign="TOP" colspan="1">subject33</th>
    <th valign="TOP" colspan="1">subject34</th>

    <th valign="TOP" colspan="1">subject35</th>
    <th valign="TOP" colspan="1">subject36</th>
    </tr><tr align="RIGHT"><th align="LEFT" valign="TOP" rowspan="12">2050</th>
    <th align="LEFT">January</th>
    <td>0</td>
    <td>1</td>
    <td>3</td>
    <td>0</td>

    <td>4</td>
    <td>16</td>
    <td>0</td>
    <td>6</td>
    <td>2</td>
    <td>2</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>

    <td>3</td>
    <td>2</td>
    <td>0</td>
    <td>26</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>

    <td>5</td>
    <td>6</td>
    <td>0</td>
    <td>8</td>
    <td>2</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">February</th>
    <td>1</td>
    <td>0</td>

    <td>8</td>
    <td>0</td>
    <td>2</td>
    <td>4</td>
    <td>1</td>
    <td>6</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>

    <td>3</td>
    <td>0</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>
    <td>25</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>

    <td>2</td>
    <td>0</td>
    <td>4</td>
    <td>14</td>
    <td>1</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">March</th>

    <td>0</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>
    <td>4</td>
    <td>7</td>
    <td>0</td>
    <td>9</td>
    <td>2</td>

    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>9</td>
    <td>0</td>
    <td>45</td>
    <td>1</td>

    <td>0</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>
    <td>10</td>
    <td>16</td>
    <td>0</td>
    <td>5</td>
    <td>1</td>

    <td>1</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>

    </tr><tr align="RIGHT"><th align="LEFT">April</th>
    <td>1</td>
    <td>0</td>
    <td>5</td>
    <td>0</td>
    <td>3</td>
    <td>12</td>
    <td>1</td>

    <td>11</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>
    <td>0</td>
    <td>3</td>
    <td>2</td>

    <td>34</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>
    <td>6</td>
    <td>18</td>
    <td>1</td>

    <td>3</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>5</td>
    <td>1</td>
    </tr><tr align="RIGHT"><th align="LEFT">May</th>
    <td>7</td>
    <td>0</td>
    <td>6</td>
    <td>0</td>
    <td>8</td>

    <td>4</td>
    <td>1</td>
    <td>13</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>2</td>
    <td>0</td>
    <td>1</td>

    <td>7</td>
    <td>1</td>
    <td>30</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>
    <td>5</td>

    <td>12</td>
    <td>0</td>
    <td>4</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>6</td>
    <td>1</td>
    </tr><tr align="RIGHT"><th align="LEFT">June</th>
    <td>0</td>
    <td>1</td>
    <td>14</td>

    <td>0</td>
    <td>7</td>
    <td>15</td>
    <td>0</td>
    <td>17</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>
    <td>5</td>

    <td>0</td>
    <td>1</td>
    <td>3</td>
    <td>0</td>
    <td>24</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>5</td>

    <td>0</td>
    <td>6</td>
    <td>13</td>
    <td>1</td>
    <td>9</td>
    <td>1</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>1</td>
    </tr><tr align="RIGHT"><th align="LEFT">July</th>
    <td>0</td>

    <td>1</td>
    <td>6</td>
    <td>0</td>
    <td>8</td>
    <td>17</td>
    <td>1</td>
    <td>15</td>
    <td>2</td>
    <td>1</td>

    <td>0</td>
    <td>10</td>
    <td>0</td>
    <td>2</td>
    <td>15</td>
    <td>2</td>
    <td>53</td>
    <td>0</td>
    <td>3</td>

    <td>3</td>
    <td>6</td>
    <td>0</td>
    <td>7</td>
    <td>16</td>
    <td>0</td>
    <td>9</td>
    <td>1</td>
    <td>1</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">August</th>
    <td>2</td>
    <td>0</td>
    <td>5</td>
    <td>0</td>
    <td>8</td>
    <td>15</td>
    <td>1</td>

    <td>17</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    <td>5</td>
    <td>16</td>
    <td>0</td>

    <td>33</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>11</td>
    <td>0</td>
    <td>2</td>
    <td>25</td>
    <td>4</td>

    <td>8</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>3</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">September</th>
    <td>2</td>
    <td>0</td>
    <td>10</td>
    <td>0</td>
    <td>16</td>

    <td>22</td>
    <td>2</td>
    <td>19</td>
    <td>4</td>
    <td>2</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>

    <td>8</td>
    <td>0</td>
    <td>27</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>8</td>
    <td>0</td>
    <td>11</td>

    <td>31</td>
    <td>1</td>
    <td>9</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>1</td>
    <td>1</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">October</th>
    <td>3</td>
    <td>1</td>
    <td>8</td>

    <td>0</td>
    <td>4</td>
    <td>28</td>
    <td>0</td>
    <td>15</td>
    <td>2</td>
    <td>1</td>
    <td>0</td>
    <td>1</td>

    <td>0</td>
    <td>1</td>
    <td>6</td>
    <td>0</td>
    <td>15</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>3</td>

    <td>0</td>
    <td>9</td>
    <td>26</td>
    <td>1</td>
    <td>8</td>
    <td>4</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">November</th>
    <td>0</td>

    <td>3</td>
    <td>3</td>
    <td>0</td>
    <td>6</td>
    <td>23</td>
    <td>1</td>
    <td>8</td>
    <td>1</td>
    <td>2</td>

    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>3</td>
    <td>7</td>
    <td>1</td>
    <td>20</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>8</td>
    <td>0</td>
    <td>3</td>
    <td>18</td>
    <td>3</td>
    <td>7</td>
    <td>0</td>
    <td>0</td>

    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>3</td>
    <td>0</td>
    </tr><tr align="RIGHT"><th align="LEFT">December</th>
    <td>1</td>
    <td>0</td>
    <td>4</td>
    <td>0</td>
    <td>4</td>
    <td>13</td>
    <td>2</td>

    <td>15</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>2</td>
    <td>0</td>
    <td>1</td>
    <td>2</td>
    <td>0</td>

    <td>29</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>7</td>
    <td>0</td>
    <td>3</td>
    <td>20</td>
    <td>1</td>

    <td>13</td>
    <td>0</td>
    <td>1</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>
    <td>0</td>

    <td>3</td>
    <td>0</td>
    </tr></table></body>
</html>



Example 2 - syntax examples [css_selector, xpath]
Find all 'a' elements inside 'tr' table rows with css selector
found "aveverum.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/aveverum.mid"
found "carol.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid"
found "steiner.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/steiner.mid"
Find all 'a' elements inside 'tr' table rows with xpath
found "aveverum.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/aveverum.mid"
found "carol.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid"
found "steiner.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/steiner.mid"


Example 3 - syntax examples [xpath, .findall(), .getchildren()] 
['2050', 'January', '0', '1', '3', '0', '4', '16', '0', '6', '2', '2', '0', '3', '0', '3', '2', '0', '26', '1', '0', '0', '7', '0', '5', '6', '0', '8', '2', '0', '0', '0', '0', '0', '0', '0', '2', '0']
['February', '1', '0', '8', '0', '2', '4', '1', '6', '1', '2', '0', '3', '0', '0', '4', '0', '25', '0', '0', '1', '2', '0', '4', '14', '1', '1', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0']
['March', '0', '0', '4', '0', '4', '7', '0', '9', '2', '1', '0', '0', '0', '2', '9', '0', '45', '1', '0', '0', '7', '0', '10', '16', '0', '5', '1', '1', '0', '1', '0', '0', '0', '0', '4', '0']
['April', '1', '0', '5', '0', '3', '12', '1', '11', '0', '3', '0', '3', '0', '0', '3', '2', '34', '0', '0', '1', '2', '0', '6', '18', '1', '3', '0', '0', '0', '0', '0', '0', '0', '0', '5', '1']
['May', '7', '0', '6', '0', '8', '4', '1', '13', '0', '0', '2', '2', '0', '1', '7', '1', '30', '0', '0', '0', '7', '0', '5', '12', '0', '4', '1', '0', '0', '0', '0', '0', '0', '0', '6', '1']
['June', '0', '1', '14', '0', '7', '15', '0', '17', '1', '2', '0', '5', '0', '1', '3', '0', '24', '0', '0', '0', '5', '0', '6', '13', '1', '9', '1', '1', '0', '0', '0', '0', '0', '0', '2', '1']
['July', '0', '1', '6', '0', '8', '17', '1', '15', '2', '1', '0', '10', '0', '2', '15', '2', '53', '0', '3', '3', '6', '0', '7', '16', '0', '9', '1', '1', '0', '0', '0', '0', '1', '0', '2', '0']
['August', '2', '0', '5', '0', '8', '15', '1', '17', '0', '2', '0', '2', '0', '5', '16', '0', '33', '0', '0', '0', '11', '0', '2', '25', '4', '8', '0', '0', '0', '1', '0', '0', '0', '0', '3', '0']
['September', '2', '0', '10', '0', '16', '22', '2', '19', '4', '2', '0', '0', '0', '2', '8', '0', '27', '0', '1', '0', '8', '0', '11', '31', '1', '9', '0', '0', '0', '1', '0', '0', '0', '1', '1', '0']
['October', '3', '1', '8', '0', '4', '28', '0', '15', '2', '1', '0', '1', '0', '1', '6', '0', '15', '0', '1', '0', '3', '0', '9', '26', '1', '8', '4', '0', '0', '0', '0', '0', '0', '0', '1', '0']
['November', '0', '3', '3', '0', '6', '23', '1', '8', '1', '2', '0', '1', '0', '3', '7', '1', '20', '0', '0', '0', '8', '0', '3', '18', '3', '7', '0', '0', '0', '0', '0', '0', '0', '0', '3', '0']
['December', '1', '0', '4', '0', '4', '13', '2', '15', '1', '0', '0', '2', '0', '1', '2', '0', '29', '0', '1', '0', '7', '0', '3', '20', '1', '13', '0', '1', '0', '0', '0', '0', '0', '0', '3', '0']


Example 4 - following sibling []
parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']") = [<Element td at 0xda53c0>, <Element td at 0xda5390>]
parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")[0].text = Intel

>

Scrape HTML Files with lxml

GREAT lxml example links

HTML Node vs Element

W3C DOM node types, each followed by the child node types it may contain:

  • Document — Element (maximum of one), ProcessingInstruction, Comment, DocumentType
  • DocumentFragment — Element, ProcessingInstruction, Comment, Text, CDATASection, EntityReference
  • DocumentType — no children
  • EntityReference — Element, ProcessingInstruction, Comment, Text, CDATASection, EntityReference
  • Element — Element, Text, Comment, ProcessingInstruction, CDATASection, EntityReference
  • Attr — Text, EntityReference
  • ProcessingInstruction — no children
  • Comment — no children
  • Text — no children
  • CDATASection — no children
  • Entity — Element, ProcessingInstruction, Comment, Text, CDATASection, EntityReference
  • Notation — no children

My Scrape HTML File Example

Background

This, http://joecodeswell.org/examples/dlwebfiles/index.html, is the URL we will be scraping, using lxml to find the files to download and urllib to download them. Here is what the URL content looks like.

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
  <head>
    <meta http-equiv="content-type" content="text/html; charset=windows-1252">
    <title>dlwebfiles</title>
  </head>
  <body>
    <h1>Index of Joe Codeswell examples - dlwebfiles</h1>
    <ul>
      <li><a href="http://joecodeswell.org/examples/dlwebfiles/aveverum.mid">aveverum.mid</a></li>
      <li><a href="http://joecodeswell.org/examples/dlwebfiles/carol.mid">carol.mid</a></li>
      <li><a href="http://joecodeswell.org/examples/dlwebfiles/steiner.mid">steiner.mid</a></li>
    </ul>
  </body>
</html>

Step-1

Create this folder structure on your local machine: “example_folder/mid”.

Step-2

Put the following Python code into a file in example_folder, naming it “retrieveMidis.py”.

# -*- coding: UTF-8 -*-
# retrieveMidis.py

import os, lxml.html, urllib

inScrapeUrl = 'http://joecodeswell.org/examples/dlwebfiles/index.html'
# Download folder, relative to the current directory (created in Step-1).
outDataFolderPath = os.path.join('mid')

# parse the html
htmltree = lxml.html.parse(inScrapeUrl)

# retrieve the midi files to the ./mid dir
theLiList = htmltree.xpath('/html/body/ul/li')
opener = urllib.URLopener()
for li in theLiList:
    # see http://www.w3schools.com/xpath/xpath_syntax.asp
    theAnchors = li.xpath('a')
    if not theAnchors:
        # list item without a direct <a> child: nothing to download
        continue
    theHref = theAnchors[0].get('href')
    if not theHref:
        # <a> without an href (or with an empty one): nothing to download
        continue
    theBasename = os.path.basename(theHref)
    if not theBasename:
        # href has no file-name part after its last '/'
        continue
    theExtension = os.path.splitext(theBasename)[1]
    # Build the destination path once; it is both reported and used below.
    theOutPath = os.path.join(outDataFolderPath, theBasename)
    print("theHref = %s"%(theHref))
    print("theBasename = %s"%(theBasename))
    print("len(theBasename) = %s"%(len(theBasename)))
    print("theExtension = %s"%(theExtension))
    print("os.path.join(outDataFolderPath,theBasenme) = %s"%(theOutPath))
    print("")
    print("")
    opener.retrieve(theHref, theOutPath)

Step-3

Run retrieveMidis.py.

Here’s what the output looks like on Win XP.

>retrieveMidis.py
theHref = http://joecodeswell.org/examples/dlwebfiles/aveverum.mid
theBasename = aveverum.mid
len(theBasename) = 12
theExtension = .mid
os.path.join(outDataFolderPath,theBasenme) = mid\aveverum.mid


theHref = http://joecodeswell.org/examples/dlwebfiles/carol.mid
theBasename = carol.mid
len(theBasename) = 9
theExtension = .mid
os.path.join(outDataFolderPath,theBasenme) = mid\carol.mid


theHref = http://joecodeswell.org/examples/dlwebfiles/steiner.mid
theBasename = steiner.mid
len(theBasename) = 11
theExtension = .mid
os.path.join(outDataFolderPath,theBasenme) = mid\steiner.mid