More lxml Syntax Examples
Continued from lxml HTML Scraping Syntax Examples
Content:
- Python Code
- Resulting Output
Python Code
#!/usr/local/bin/python2.7 # -*- coding: UTF-8 -*- """lxmlScrapingExamplesMore.py takes INURL [URL to an html file] Producing OUTFILEPATH [a scrapped text file] Usage: lxmlScrapingExamplesMore.py INURL OUTFILEPATH Example: lxmlScrapingExamplesMore.py http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm lxmlScrapingOutput.txt """ import sys,os # joe professional opinion: package structure a bit goofy! 🙂 import lxml, lxml.html def lxmlScrapingExamples(myinurl, myoutfilepath): myinurl = 'http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm' # what gets called print myinurl print myoutfilepath #Example 1 redo for myinurl new value print "\n\nExample 1 - basic parsing of url" htmltree = lxml.html.parse(myinurl) # print "lxml.etree.tostring(htmltree, pretty_print=True) = %s"%(lxml.etree.tostring(htmltree, pretty_print=True)) #Example 5 - xpath tag with class=value N.B. backslashes for newLines, etc., DISAPPEAR in WordPress Marldown # see http://lxml.de/xpathxslt.html print "\n\nExample 5 - xpath tag with class=value" print """htmltree.xpath("//h1[@class='title topictitle1']")[0].text = %s"""%(htmltree.xpath("//h1[@class='title topictitle1']")[0].text) print """htmltree.xpath("//p[@class='shortdesc']")[0].text = %s"""%(htmltree.xpath("//p[@class='shortdesc']")[0].text) print """len(htmltree.xpath("//var[@class='keyword varname']")) = %s"""%(len(htmltree.xpath("//var[@class='keyword varname']"))) print """htmltree.xpath("//var[@class='keyword varname']")[0].text = %s"""%(htmltree.xpath("//var[@class='keyword varname']")[0].text) #Example 6 - parent and ElementVariables with // VS .// print "\n\nExample 6 - parent and ElementVariables" print """syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent()) syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent() print """syntax_div = %s"""%(syntax_div) print syntax_div_2string,'\n' print """syntax_div = %s"""%(syntax_div) print "\n\nsyntax_div.xpath // VS .// \n\n" print "// uses htmltree" print """ syntax_div.xpath("count(//samp)") = %s"""%(syntax_div.xpath("count(//samp)")) print ' equals\n' print """ htmltree.xpath("count(//samp)") = %s"""%(htmltree.xpath("count(//samp)")) print """ syntax_div.xpath("count(//var)") = %s"""%(syntax_div.xpath("count(//var)")) print ' equals' print """ htmltree.xpath("count(//var)") = %s"""%(htmltree.xpath("count(//var)")) print '\nVS .// uses syntax_div ONLY' print """ htmltree.xpath("count(.//samp)") = %s"""%(htmltree.xpath("count(.//samp)")) print """ syntax_div.xpath("count(.//var)") = %s"""%(syntax_div.xpath("count(.//var)")) print "\n" print syntax_div_ipython_discovery #Example 7 - xpath select element by text print "\n\nExample 7 - xpath select element by text" print """description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent()) description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent() print """description_div = %s"""%(description_div) print description_div_2string,'\n' print """description_div.xpath("./p")[0].text = %s"""%(description_div.xpath("./p")[0].text) #Example 8 - get all text in an element print "\n\nExample 8 - get all text in element\nsee http://lxml.de/lxmlhtml.html#html-element-methods" print """xample_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent()) example_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent() print example_div_2string,'\n' print "example_div.text_content() = %s"%(example_div.text_content()) #Example 9 - zip/dict data terms & data definitions print "\n\nExample 9 - zipping data terms & data definitions" print """options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent() = %s"""%(htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent()) options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent() print options_div_2string terms = [t.text for t in options_div.xpath("dl/dt/samp/var")] defs = [d.text for d in options_div.xpath("dl/dd")] term_def_dict = dict(zip(terms, defs)) print '\nterm_def_dict' for k, v in term_def_dict.iteritems(): print ' %s: %s'%(k,v) #print lxml.etree.tostring(options_div, pretty_print=True) options_div_2string = """<div class="section"> <h2 class="title sectiontitle">Options</h2> <dl class="dl"> <dt class="dt dlterm"> <samp class="ph codeph"> <var class="keyword varname">acl_name</var> </samp> </dt> <dd class="dd">Specifies the ACL policy that is applied to the named object. The ACL policy must exist, or an error is displayed. <p class="p">Examples of the ACL names are <samp class="ph codeph">default-root</samp>, <samp class="ph codeph">test</samp>, <samp class="ph codeph">default-management</samp>, and <samp class="ph codeph">pubs_acl3</samp>.</p> </dd> <dt class="dt dlterm"> <samp class="ph codeph"> <var class="keyword varname">object_name</var> </samp> </dt> <dd class="dd">Specifies the object to which to apply the named ACL policy. The object name must exist, or an error is displayed. <p class="p">Examples of object names are: </p> <ul class="ul"> <li class="li"> <samp class="ph codeph">/Management/Groups/Travel</samp> </li> <li class="li"> <samp class="ph codeph">/WebSEAL</samp> </li> <li class="li"> <samp class="ph codeph">/Management</samp> </li> </ul> </dd> </dl> </div>""" example_div_2string = """<div class="example"> <h2 class="title sectiontitle">Example</h2> <div class="p">The following example attaches the ACL policy, <samp class="ph codeph">pubs_acl3</samp>, to the protected object, <samp class="ph codeph">/Management</samp>: <pre class="pre codeblock"> <code>pdadmin sec_master> acl attach /Management pubs_acl3</code> </pre> </div> </div> """ description_div_2string = """<div class="section"> <h2 class="title sectiontitle">Syntax</h2> <p class="p"> <span class="keyword cmdname">acl attach</span> <samp class="ph codeph"> <var class="keyword varname">object_name</var></samp> <samp class="ph codeph"> <var class="keyword varname">acl_name</var> </samp> </p> </div>""" syntax_div_2string = """<div class="section"> <h2 class="title sectiontitle">Syntax</h2> <p class="p"> <span class="keyword cmdname">acl attach</span> <samp class="ph codeph"> <var class="keyword varname">object_name</var> </samp> <samp class="ph codeph"> <var class="keyword varname">acl_name</var> </samp> </p> </div>""" syntax_div_ipython_discovery = """In [54]: syntax_div. syntax_div.addnext syntax_div.get_element_by_id syntax_div.keys syntax_div.addprevious syntax_div.getchildren syntax_div.label syntax_div.append syntax_div.getiterator syntax_div.make_links_absolut syntax_div.attrib syntax_div.getnext syntax_div.makeelement syntax_div.base syntax_div.getparent syntax_div.nsmap syntax_div.base_url syntax_div.getprevious syntax_div.prefix syntax_div.body syntax_div.getroottree syntax_div.remove syntax_div.clear syntax_div.head syntax_div.replace syntax_div.cssselect syntax_div.index syntax_div.resolve_base_href syntax_div.drop_tag syntax_div.insert syntax_div.rewrite_links syntax_div.drop_tree syntax_div.items syntax_div.set syntax_div.extend syntax_div.iter syntax_div.sourceline syntax_div.find syntax_div.iterancestors syntax_div.tag syntax_div.find_class syntax_div.iterchildren syntax_div.tail syntax_div.find_rel_links syntax_div.iterdescendants syntax_div.text syntax_div.findall syntax_div.iterfind syntax_div.text_content syntax_div.findtext syntax_div.iterlinks syntax_div.values syntax_div.forms syntax_div.itersiblings syntax_div.xpath syntax_div.get syntax_div.itertext """ NUM_ARGS = 2 def main(): args = sys.argv[1:] if len(args) != NUM_ARGS or "-h" in args or "--help" in args: print __doc__ s = raw_input('hit return to quit') sys.exit(2) lxmlScrapingExamples(args[0], args[1]) if __name__ == '__main__': main()
Resulting Output
>lxmlScrapingExamplesMore.py http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm lxmlScrapingOutput.txt http://joecodeswell.org/examples/dlwebfiles/acl_attach.htm lxmlScrapingOutput.txt Example 1 - basic parsing of url Example 5 - xpath tag with class=value htmltree.xpath("//h1[@class='title topictitle1']")[0].text = acl attach htmltree.xpath("//p[@class='shortdesc']")[0].text = Attaches an ACL policy to a protected object. If the protected object already has an ACL attached, the ACL is replaced with a new one. len(htmltree.xpath("//var[@class='keyword varname']")) = 4 htmltree.xpath("//var[@class='keyword varname']")[0].text = object_name Example 6 - parent and ElementVariables syntax_div = htmltree.xpath("//h2[@class='title sectiontitle']")[0].getparent() = <Element div at 0xb7df00> syntax_div = <Element div at 0xb7df00> <div class="section"> <h2 class="title sectiontitle">Syntax</h2> <p class="p"> <span class="keyword cmdname">acl attach</span> <samp class="ph codeph"> <var class="keyword varname">object_name</var> </samp> <samp class="ph codeph"> <var class="keyword varname">acl_name</var> </samp> </p> </div> syntax_div = <Element div at 0xb7df00> syntax_div.xpath // VS .// // uses htmltree syntax_div.xpath("count(//samp)") = 14.0 equals htmltree.xpath("count(//samp)") = 14.0 syntax_div.xpath("count(//var)") = 4.0 equals htmltree.xpath("count(//var)") = 4.0 VS .// uses syntax_div ONLY htmltree.xpath("count(.//samp)") = 14.0 syntax_div.xpath("count(.//var)") = 2.0 In [54]: syntax_div. syntax_div.addnext syntax_div.get_element_by_id syntax_div.keys syntax_div.addprevious syntax_div.getchildren syntax_div.label syntax_div.append syntax_div.getiterator syntax_div.make_links_absolut syntax_div.attrib syntax_div.getnext syntax_div.makeelement syntax_div.base syntax_div.getparent syntax_div.nsmap syntax_div.base_url syntax_div.getprevious syntax_div.prefix syntax_div.body syntax_div.getroottree syntax_div.remove syntax_div.clear syntax_div.head syntax_div.replace syntax_div.cssselect syntax_div.index syntax_div.resolve_base_href syntax_div.drop_tag syntax_div.insert syntax_div.rewrite_links syntax_div.drop_tree syntax_div.items syntax_div.set syntax_div.extend syntax_div.iter syntax_div.sourceline syntax_div.find syntax_div.iterancestors syntax_div.tag syntax_div.find_class syntax_div.iterchildren syntax_div.tail syntax_div.find_rel_links syntax_div.iterdescendants syntax_div.text syntax_div.findall syntax_div.iterfind syntax_div.text_content syntax_div.findtext syntax_div.iterlinks syntax_div.values syntax_div.forms syntax_div.itersiblings syntax_div.xpath syntax_div.get syntax_div.itertext Example 7 - xpath select element by text description_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Description']")[0].getparent() = <Element div at 0xd10e40> description_div = <Element div at 0xd10e40> <div class="section"> <h2 class="title sectiontitle">Syntax</h2> <p class="p"> <span class="keyword cmdname">acl attach</span> <samp class="ph codeph"> <var class="keyword varname">object_name</var></samp> <samp class="ph codeph"> <var class="keyword varname">acl_name</var> </samp> </p> </div> description_div.xpath("./p")[0].text = At most, one ACL can be attached to a given protected object. The same ACL can be attached to multiple protected objects. Ensure that you are familiar with ACL management before you use this function. Example 8 - get all text in element see http://lxml.de/lxmlhtml.html#html-element-methods xample_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Example']")[0].getparent() = <Element div at 0xd10e70> <div class="example"> <h2 class="title sectiontitle">Example</h2> <div class="p">The following example attaches the ACL policy, <samp class="ph codeph">pubs_acl3</samp>, to the protected object, <samp class="ph codeph">/Management</samp>: <pre class="pre codeblock"> <code>pdadmin sec_master> acl attach /Management pubs_acl3</code> </pre> </div> </div> example_div.text_content() = ExampleThe following example attaches the ACL policy, pubs_acl3, to the protected object, /Management: pdadmin sec_master> acl attach /Management pubs_acl3 Example 9 - zipping data terms & data definitions options_div = htmltree.xpath("//h2[@class='title sectiontitle' and text()='Options']")[0].getparent() = <Element div at 0xd10d80> <div class="section"> <h2 class="title sectiontitle">Options</h2> <dl class="dl"> <dt class="dt dlterm"> <samp class="ph codeph"> <var class="keyword varname">acl_name</var> </samp> </dt> <dd class="dd">Specifies the ACL policy that is applied to the named object. The ACL policy must exist, or an error is displayed. <p class="p">Examples of the ACL names are <samp class="ph codeph">default-root</samp>, <samp class="ph codeph">test</samp>, <samp class="ph codeph">default-management</samp>, and <samp class="ph codeph">pubs_acl3</samp>.</p> </dd> <dt class="dt dlterm"> <samp class="ph codeph"> <var class="keyword varname">object_name</var> </samp> </dt> <dd class="dd">Specifies the object to which to apply the named ACL policy. The object name must exist, or an error is displayed. <p class="p">Examples of object names are: </p> <ul class="ul"> <li class="li"> <samp class="ph codeph">/Management/Groups/Travel</samp> </li> <li class="li"> <samp class="ph codeph">/WebSEAL</samp> </li> <li class="li"> <samp class="ph codeph">/Management</samp> </li> </ul> </dd> </dl> </div> term_def_dict object_name: Specifies the object to which to apply the named ACL policy. The object name must exist, or an error is displayed. acl_name: Specifies the ACL policy that is applied to the named object. The ACL policy must exist, or an error is displayed. >