from lxml import etree
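# XPath quick reference (lxml)
#
# Selecting nodes
# xpath("//*")                         all descendant nodes of the document
# xpath("//body//*")                   all descendant nodes of the body node
# xpath("/html/*")                     the child nodes directly under the html node
# xpath("//body/p[@class='title']")    the p nodes under body whose class attribute equals 'title'
#
# Getting node attributes
# xpath("//body/p/@name")              the value of the name attribute of the p nodes under body;
#                                      if there are several, all of them are returned
# xpath("//body/p[@class='title']/..") the parent node of the p node (under body) whose class is 'title'
#
# Getting text: you have to step into the node's tag to get it
# xpath("//body/p[@class='title']/b/text()")
#
# Matching a multi-valued attribute: when a node's attribute holds several values and
# cannot be matched exactly, select it with contains(@attribute_name, value)
#
# Matching several attributes: join the conditions with `and`
# xpath("//body/p[@class='title'and @name = 'dromouse']/b/text()")
#
# `and` here is really an operator, and there are many more operators:
#   or -> logical or, and -> logical and,
#   //b | //cd -> returns the node set containing both the b and the cd elements
# xpath("//body/p[@class='title'or @class = 'story']//text()")
# xpath("//head//text() | //body/p[@class='title']//text()")
#
# When several nodes match, one can be picked by index, because the result is a list.
# There is one difference, though: the index starts at 1 rather than the usual 0.
# result = html.xpath("//body/p[1]")
# result1 = html.xpath("//body/p[2]")
# result2 = html.xpath("//body/p[3]")
# print(result,result1,result2)
#
# Using node axes
# ancestor axis: selects all ancestor nodes; it is followed by two colons and then the selector
# xpath("//p[@class = 'title']/ancestor::*")
# attribute axis: selects all attribute values; it is followed by two colons and then the selector
# xpath("//p[@class = 'title']/attribute::*")
# child axis: selects all direct child nodes; it is followed by two colons and then the selector
# xpath("//p[@class = 'title']/child::b")
# descendant axis: selects all descendant nodes; it is followed by two colons and then the selector
# xpath("//p[@class = 'story']/descendant::*")
#
# Snippet that writes the sample document to ./test.html:
# text = """
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <p class="story">...</p>
# """
# html = etree.HTML(text)
# result = etree.tostring(html)
# with open("./test.html","wb")as f :
#     f.write(result)

# Reference copy of the sample HTML document, kept in an unused string literal:
"""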
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""
from lxml import etree
# Load the saved sample document, parsing it with lxml's HTML parser.
parser = etree.HTMLParser()
html = etree.parse("./test.html", parser)

# descendant axis demo: select every element nested at any depth below
# each <p> whose class attribute is 'story', then show the matches.
story_descendants_query = "//p[@class = 'story']/descendant::*"
result = html.xpath(story_descendants_query)
print(result)