lxml Syntax Examples
Content:
- Python Code
- Resulting Output
Python Code
#!/usr/local/bin/python2.7
# -*- coding: UTF-8 -*-
"""lxmlScrapingExamples.py takes INURL [URL to an html file] Producing OUTFILEPATH [a scraped text file]
Usage: lxmlScrapingExamples.py INURL OUTFILEPATH
Example: lxmlScrapingExamples.py http://joecodeswell.org/examples/dlwebfiles/htmlExample.html lxmlScrapingOutput.txt
"""
import sys
# joe professional opinion: package structure a bit goofy! 🙂
import lxml, lxml.html
# HTML fixture for Example 4: a two-column (name / desc) table describing two CPUs.
_SIBLING_EXAMPLE_HTML = '''
<html>
<head>
<title>following sibling</title>
</head>
<body>
<table border>
<tr>
<td class="name">Brand</td>
<td class="desc">Intel</td>
</tr>
<tr>
<td class="name">Series</td>
<td class="desc">Core i5</td>
</tr>
<tr>
<td class="name">Cores</td>
<td class="desc">4</td>
</tr>
<tr>
<td class="name">Socket</td>
<td class="desc">LGA 1156</td>
</tr>
<tr>
<td class="name">Brand</td>
<td class="desc">AMD</td>
</tr>
<tr>
<td class="name">Series</td>
<td class="desc">Phenom II X4</td>
</tr>
<tr>
<td class="name">Cores</td>
<td class="desc">4</td>
</tr>
<tr>
<td class="name">Socket</td>
<td class="desc">Socket AM3</td>
</tr>
</table>
</body>
</html>
'''


def lxmlScrapingExamples(myinurl, myoutfilepath):
    """Run four lxml scraping demonstrations and print the results.

    Examples 1-3 parse the document at ``myinurl``; Example 4 parses the
    HTML fixture embedded in this file.

    Args:
        myinurl: location of the HTML document, passed to lxml.html.parse().
        myoutfilepath: output path given on the command line.  It is only
            echoed; this function never writes to it.

    Raises:
        IndexError: if the parsed page has fewer than two ``body/table``
            elements (Example 3 indexes the second one).
    """
    print(myinurl)
    print(myoutfilepath)

    # Example 1 - basic parsing of url
    # slightly altered from: http://stackoverflow.com/a/14303564/601770
    print("\n\nExample 1 - basic parsing of url")
    htmltree = lxml.html.parse(myinurl)
    pretty = lxml.etree.tostring(htmltree, pretty_print=True)
    print("lxml.etree.tostring(htmltree, pretty_print=True) = %s" % (pretty,))

    # Example 2 - syntax examples [css_selector, xpath]
    # slightly altered from: http://stackoverflow.com/a/603630/601770
    print("\n\nExample 2 - syntax examples [css_selector, xpath]")
    # lxml.html.parse() returns an lxml.etree._ElementTree, which has no
    # cssselect() method (calling it raises AttributeError).  The method
    # lives on the root element, so take the root instead of serialising
    # the tree and parsing it a second time.
    mySearchTree = htmltree.getroot()

    # Find all 'a' elements inside 'tr' table rows with css selector
    print("Find all 'a' elements inside 'tr' table rows with css selector")
    for itm in mySearchTree.cssselect('tr a'):
        print('found "%s" link to href "%s"' % (itm.text, itm.get('href')))

    # Find all 'a' elements inside 'tr' table rows with xpath
    # (this expression only matches 'a' elements that are grandchildren of a 'tr')
    print("Find all 'a' elements inside 'tr' table rows with xpath")
    for itm in mySearchTree.xpath('.//tr/*/a'):
        print('found "%s" link to href "%s"' % (itm.text, itm.get('href')))

    # Example 3 - syntax examples [xpath, .findall(), iterating children]
    # slightly altered from: http://stackoverflow.com/a/9920703/601770
    print("\n\nExample 3 - syntax examples [xpath, .findall(), .getchildren()] ")
    # table [1] is the 2nd table in MY example html
    rows = htmltree.xpath("body/table")[1].findall("tr")
    # Iterating an element yields its children; this replaces the
    # deprecated .getchildren() call.
    data = [[cell.text for cell in row] for row in rows]
    # Skip the first four rows, which are header rows in the example page.
    for itm in data[4:]:
        print(itm)

    # Example 4 - following sibling []
    # slightly altered from:
    # http://stackoverflow.com/questions/3139402/how-to-select-following-sibling-xml-tag-using-xpath
    print("\n\nExample 4 - following sibling []")
    parsedDocument = lxml.html.fromstring(_SIBLING_EXAMPLE_HTML)
    # bad - attempts kept for reference (the last four are not valid Python):
    #   rlist = parsedDocument.xpath("tr[td[@class='name'] ='Brand']")
    #   rlist = parsedDocument.xpath("tr[td[@class='name'] ='Brand']/td[@class='desc']")
    #   r = parsedDocument.xpath(tr/td[@class="name"])=='Brand')
    #   r = parsedDocument.tr[td[@class='name'] ='Brand'].text
    #   r = parsedDocument.tr[td[@class='name'] ='Brand']/td[@class='desc'].text
    #   if(parsedDocument.xpath(tr/td[@class="name"])=='Brand'):
    # good - '//' searches the whole document for matching rows:
    #   parsedDocument.xpath('/html/body/table/tr')
    brand_descs = parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")
    print("""parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']") = %s""" % (brand_descs,))
    print("""parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")[0].text = %s""" % (brand_descs[0].text,))
    print('\n\n\n')
# Number of required positional arguments: INURL and OUTFILEPATH.
NUM_ARGS = 2


def main():
    """Command-line entry point.

    Reads the arguments after the script name.  If there are not exactly
    NUM_ARGS of them, or if "-h" / "--help" is among them, prints the
    module docstring, waits for the user to press return, and exits with
    status 2.  Otherwise passes the two arguments to
    lxmlScrapingExamples().
    """
    args = sys.argv[1:]
    if len(args) != NUM_ARGS or "-h" in args or "--help" in args:
        print(__doc__)
        # Wait for return before exiting; the typed text is not used.
        raw_input('hit return to quit')
        sys.exit(2)
    lxmlScrapingExamples(args[0], args[1])
# Run the examples only when this file is executed as a script.
if __name__ == "__main__":
    main()
Resulting Output
>lxmlScrapingExamples.py http://joecodeswell.org/examples/dlwebfiles/htmlExample.html lxmlScrapingOutput.txt
http://joecodeswell.org/examples/dlwebfiles/htmlExample.html
lxmlScrapingOutput.txt
Example 1 - basic parsing of url
lxml.etree.tostring(htmltree, pretty_print=True) = <!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=windows-1252"/>
<title>lxml htmlExamples.html</title>
</head>
<body>
<h1>lxml htmlExamples.html for Joe Codeswell examples - dlwebfiles</h1>
<h2>Example 1</h2>
<ul><li><a href="http://joecodeswell.org/examples/dlwebfiles/aveverum.mid">aveverum.mid</a></li>
<li><a href="http://joecodeswell.org/examples/dlwebfiles/carol.mid">carol.mid</a></li>
<li><a href="http://joecodeswell.org/examples/dlwebfiles/steiner.mid">steiner.mid</a></li>
</ul><h2>Example 2</h2>
<table align="left" border="0" cellspacing="0" cellpadding="0" width="100%"><tr align="left" valign="top"><th>Name</th>
<th>File Name &amp; Link</th>
</tr><tr align="left" valign="top"><td>Ave Verum</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/aveverum.mid">aveverum.mid</a></td></tr><tr align="left" valign="top"><td>A Carol</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid">carol.mid</a></td></tr><tr align="left" valign="top"><td>Steiner Amen?</td><td><a href="http://joecodeswell.org/examples/dlwebfiles/steiner.mid">steiner.mid</a></td></tr></table><h2>Example 3</h2>
<table border=""><tr align="LEFT"><th colspan="38">Main Subject</th>
</tr><tr align="LEFT"><th colspan="2"> </th>
<th valign="TOP" colspan="18">part1</th>
<th valign="TOP" colspan="18">part2</th>
</tr><tr align="LEFT"><th colspan="2"> </th>
<th valign="TOP" colspan="9">sub-part1</th>
<th valign="TOP" colspan="9">sub-part2</th>
<th valign="TOP" colspan="9">sub-part3</th>
<th valign="TOP" colspan="9">sub-part4</th>
</tr><tr align="LEFT"><th colspan="2"> </th>
<th valign="TOP" colspan="1">subject1</th>
<th valign="TOP" colspan="1">subject2</th>
<th valign="TOP" colspan="1">subject10</th>
<th valign="TOP" colspan="1">subject11</th>
<th valign="TOP" colspan="1">subject12</th>
<th valign="TOP" colspan="1">subject13</th>
<th valign="TOP" colspan="1">subject14</th>
<th valign="TOP" colspan="1">subject15</th>
<th valign="TOP" colspan="1">subject16</th>
<th valign="TOP" colspan="1">subject17</th>
<th valign="TOP" colspan="1">subject18</th>
<th valign="TOP" colspan="1">subject19</th>
<th valign="TOP" colspan="1">subject20</th>
<th valign="TOP" colspan="1">subject21</th>
<th valign="TOP" colspan="1">subject22</th>
<th valign="TOP" colspan="1">subject23</th>
<th valign="TOP" colspan="1">subject24</th>
<th valign="TOP" colspan="1">subject25</th>
<th valign="TOP" colspan="1">subject26</th>
<th valign="TOP" colspan="1">subject27</th>
<th valign="TOP" colspan="1">subject28</th>
<th valign="TOP" colspan="1">subject29</th>
<th valign="TOP" colspan="1">subject30</th>
<th valign="TOP" colspan="1">subject31</th>
<th valign="TOP" colspan="1">subject32</th>
<th valign="TOP" colspan="1">subject33</th>
<th valign="TOP" colspan="1">subject34</th>
<th valign="TOP" colspan="1">subject35</th>
<th valign="TOP" colspan="1">subject36</th>
</tr><tr align="RIGHT"><th align="LEFT" valign="TOP" rowspan="12">2050</th>
<th align="LEFT">January</th>
<td>0</td>
<td>1</td>
<td>3</td>
<td>0</td>
<td>4</td>
<td>16</td>
<td>0</td>
<td>6</td>
<td>2</td>
<td>2</td>
<td>0</td>
<td>3</td>
<td>0</td>
<td>3</td>
<td>2</td>
<td>0</td>
<td>26</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>7</td>
<td>0</td>
<td>5</td>
<td>6</td>
<td>0</td>
<td>8</td>
<td>2</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">February</th>
<td>1</td>
<td>0</td>
<td>8</td>
<td>0</td>
<td>2</td>
<td>4</td>
<td>1</td>
<td>6</td>
<td>1</td>
<td>2</td>
<td>0</td>
<td>3</td>
<td>0</td>
<td>0</td>
<td>4</td>
<td>0</td>
<td>25</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>2</td>
<td>0</td>
<td>4</td>
<td>14</td>
<td>1</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">March</th>
<td>0</td>
<td>0</td>
<td>4</td>
<td>0</td>
<td>4</td>
<td>7</td>
<td>0</td>
<td>9</td>
<td>2</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td>9</td>
<td>0</td>
<td>45</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>7</td>
<td>0</td>
<td>10</td>
<td>16</td>
<td>0</td>
<td>5</td>
<td>1</td>
<td>1</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>4</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">April</th>
<td>1</td>
<td>0</td>
<td>5</td>
<td>0</td>
<td>3</td>
<td>12</td>
<td>1</td>
<td>11</td>
<td>0</td>
<td>3</td>
<td>0</td>
<td>3</td>
<td>0</td>
<td>0</td>
<td>3</td>
<td>2</td>
<td>34</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>2</td>
<td>0</td>
<td>6</td>
<td>18</td>
<td>1</td>
<td>3</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>5</td>
<td>1</td>
</tr><tr align="RIGHT"><th align="LEFT">May</th>
<td>7</td>
<td>0</td>
<td>6</td>
<td>0</td>
<td>8</td>
<td>4</td>
<td>1</td>
<td>13</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td>2</td>
<td>0</td>
<td>1</td>
<td>7</td>
<td>1</td>
<td>30</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>7</td>
<td>0</td>
<td>5</td>
<td>12</td>
<td>0</td>
<td>4</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>6</td>
<td>1</td>
</tr><tr align="RIGHT"><th align="LEFT">June</th>
<td>0</td>
<td>1</td>
<td>14</td>
<td>0</td>
<td>7</td>
<td>15</td>
<td>0</td>
<td>17</td>
<td>1</td>
<td>2</td>
<td>0</td>
<td>5</td>
<td>0</td>
<td>1</td>
<td>3</td>
<td>0</td>
<td>24</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>5</td>
<td>0</td>
<td>6</td>
<td>13</td>
<td>1</td>
<td>9</td>
<td>1</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td>1</td>
</tr><tr align="RIGHT"><th align="LEFT">July</th>
<td>0</td>
<td>1</td>
<td>6</td>
<td>0</td>
<td>8</td>
<td>17</td>
<td>1</td>
<td>15</td>
<td>2</td>
<td>1</td>
<td>0</td>
<td>10</td>
<td>0</td>
<td>2</td>
<td>15</td>
<td>2</td>
<td>53</td>
<td>0</td>
<td>3</td>
<td>3</td>
<td>6</td>
<td>0</td>
<td>7</td>
<td>16</td>
<td>0</td>
<td>9</td>
<td>1</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>2</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">August</th>
<td>2</td>
<td>0</td>
<td>5</td>
<td>0</td>
<td>8</td>
<td>15</td>
<td>1</td>
<td>17</td>
<td>0</td>
<td>2</td>
<td>0</td>
<td>2</td>
<td>0</td>
<td>5</td>
<td>16</td>
<td>0</td>
<td>33</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>11</td>
<td>0</td>
<td>2</td>
<td>25</td>
<td>4</td>
<td>8</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>3</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">September</th>
<td>2</td>
<td>0</td>
<td>10</td>
<td>0</td>
<td>16</td>
<td>22</td>
<td>2</td>
<td>19</td>
<td>4</td>
<td>2</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td>8</td>
<td>0</td>
<td>27</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>8</td>
<td>0</td>
<td>11</td>
<td>31</td>
<td>1</td>
<td>9</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>1</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">October</th>
<td>3</td>
<td>1</td>
<td>8</td>
<td>0</td>
<td>4</td>
<td>28</td>
<td>0</td>
<td>15</td>
<td>2</td>
<td>1</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>1</td>
<td>6</td>
<td>0</td>
<td>15</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>3</td>
<td>0</td>
<td>9</td>
<td>26</td>
<td>1</td>
<td>8</td>
<td>4</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">November</th>
<td>0</td>
<td>3</td>
<td>3</td>
<td>0</td>
<td>6</td>
<td>23</td>
<td>1</td>
<td>8</td>
<td>1</td>
<td>2</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>3</td>
<td>7</td>
<td>1</td>
<td>20</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>8</td>
<td>0</td>
<td>3</td>
<td>18</td>
<td>3</td>
<td>7</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>3</td>
<td>0</td>
</tr><tr align="RIGHT"><th align="LEFT">December</th>
<td>1</td>
<td>0</td>
<td>4</td>
<td>0</td>
<td>4</td>
<td>13</td>
<td>2</td>
<td>15</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td>0</td>
<td>1</td>
<td>2</td>
<td>0</td>
<td>29</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>7</td>
<td>0</td>
<td>3</td>
<td>20</td>
<td>1</td>
<td>13</td>
<td>0</td>
<td>1</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>3</td>
<td>0</td>
</tr></table></body>
</html>
Example 2 - syntax examples [css_selector, xpath]
Find all 'a' elements inside 'tr' table rows with css selector
found "aveverum.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/aveverum.mid"
found "carol.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid"
found "steiner.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/steiner.mid"
Find all 'a' elements inside 'tr' table rows with xpath
found "aveverum.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/aveverum.mid"
found "carol.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/carol.mid.mid"
found "steiner.mid" link to href "http://joecodeswell.org/examples/dlwebfiles/steiner.mid"
Example 3 - syntax examples [xpath, .findall(), .getchildren()]
['2050', 'January', '0', '1', '3', '0', '4', '16', '0', '6', '2', '2', '0', '3', '0', '3', '2', '0', '26', '1', '0', '0', '7', '0', '5', '6', '0', '8', '2', '0', '0', '0', '0', '0', '0', '0', '2', '0']
['February', '1', '0', '8', '0', '2', '4', '1', '6', '1', '2', '0', '3', '0', '0', '4', '0', '25', '0', '0', '1', '2', '0', '4', '14', '1', '1', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0']
['March', '0', '0', '4', '0', '4', '7', '0', '9', '2', '1', '0', '0', '0', '2', '9', '0', '45', '1', '0', '0', '7', '0', '10', '16', '0', '5', '1', '1', '0', '1', '0', '0', '0', '0', '4', '0']
['April', '1', '0', '5', '0', '3', '12', '1', '11', '0', '3', '0', '3', '0', '0', '3', '2', '34', '0', '0', '1', '2', '0', '6', '18', '1', '3', '0', '0', '0', '0', '0', '0', '0', '0', '5', '1']
['May', '7', '0', '6', '0', '8', '4', '1', '13', '0', '0', '2', '2', '0', '1', '7', '1', '30', '0', '0', '0', '7', '0', '5', '12', '0', '4', '1', '0', '0', '0', '0', '0', '0', '0', '6', '1']
['June', '0', '1', '14', '0', '7', '15', '0', '17', '1', '2', '0', '5', '0', '1', '3', '0', '24', '0', '0', '0', '5', '0', '6', '13', '1', '9', '1', '1', '0', '0', '0', '0', '0', '0', '2', '1']
['July', '0', '1', '6', '0', '8', '17', '1', '15', '2', '1', '0', '10', '0', '2', '15', '2', '53', '0', '3', '3', '6', '0', '7', '16', '0', '9', '1', '1', '0', '0', '0', '0', '1', '0', '2', '0']
['August', '2', '0', '5', '0', '8', '15', '1', '17', '0', '2', '0', '2', '0', '5', '16', '0', '33', '0', '0', '0', '11', '0', '2', '25', '4', '8', '0', '0', '0', '1', '0', '0', '0', '0', '3', '0']
['September', '2', '0', '10', '0', '16', '22', '2', '19', '4', '2', '0', '0', '0', '2', '8', '0', '27', '0', '1', '0', '8', '0', '11', '31', '1', '9', '0', '0', '0', '1', '0', '0', '0', '1', '1', '0']
['October', '3', '1', '8', '0', '4', '28', '0', '15', '2', '1', '0', '1', '0', '1', '6', '0', '15', '0', '1', '0', '3', '0', '9', '26', '1', '8', '4', '0', '0', '0', '0', '0', '0', '0', '1', '0']
['November', '0', '3', '3', '0', '6', '23', '1', '8', '1', '2', '0', '1', '0', '3', '7', '1', '20', '0', '0', '0', '8', '0', '3', '18', '3', '7', '0', '0', '0', '0', '0', '0', '0', '0', '3', '0']
['December', '1', '0', '4', '0', '4', '13', '2', '15', '1', '0', '0', '2', '0', '1', '2', '0', '29', '0', '1', '0', '7', '0', '3', '20', '1', '13', '0', '1', '0', '0', '0', '0', '0', '0', '3', '0']
Example 4 - following sibling []
parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']") = [<Element td at 0xda53c0>, <Element td at 0xda5390>]
parsedDocument.xpath("//tr[td[@class='name'] ='Brand']/td[@class='desc']")[0].text = Intel
>
#html, #lxml, #python, #screen-scraping