# bs4 使用
# 基本元素:
# 1. Tag:标签,用 <> 和 </> 标明开头和结尾。
# 2. Name:标签的名字,例如 <p>...</p> 的名字是 'p',格式:<tag>.name
# 3. Attributes:标签的属性,以字典形式组织,格式:<tag>.attrs
# 4. NavigableString:标签内的非属性字符串,即 <>…</> 中的字符串,格式:<tag>.string
# 5. Comment:标签内字符串的注释部分,是一种特殊的 Comment 类型。
# Fetch a course page over HTTP, parse it with the "html.parser" backend and
# print the parsed document twice: as-is and via prettify().
import requests
from bs4 import BeautifulSoup
# NOTE(review): the host looks like a proxy prefix in front of
# www.python123.io -- confirm this is the intended URL.
# A timeout keeps the script from blocking forever on an unresponsive server.
r = requests.get("https://www.python123.io/index/courses/250", timeout=30)
# Stop on a 4xx/5xx reply instead of silently parsing an error page.
r.raise_for_status()
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup)
print(soup.prettify())
# Fetch a page with urllib, parse it, and print the name, parent, attributes
# and string of soup.a / soup.p.
# NOTE(review): the host looks like a proxy prefix in front of
# www.python123.io -- confirm this is the intended URL.
demo = "https://www.python123.io/index/"
from bs4 import BeautifulSoup
# NOTE: Python 2's urllib2 was renamed to urllib.request in Python 3.
from urllib.request import urlopen
# Parse inside a `with` block so the HTTP response is closed once it has been
# read, instead of being left open for the rest of the script.
with urlopen(demo) as response:
    soup = BeautifulSoup(response, "html.parser")
print(soup.title)
# Tag names of soup.a / soup.p and of their parents.
print(soup.a.name)
print(soup.a.parent.name)
print(soup.p.name)
print(soup.p.parent.name)
# Attributes of soup.a, one attribute looked up by key, and the types involved.
tag = soup.a
print(tag.attrs)
print(tag.attrs['href'])
print(type(tag.attrs))
print(type(tag))
print("------------------------------------------")
# .string of soup.a and soup.p.
print(soup.a.string)
print(soup.p.string)
# 基于 bs4 库的 HTML 内容遍历方法(上行、下行、平行遍历)
# Download the demo page and print its raw HTML; `demo` holds the page text
# for the statements that follow.
import requests
from bs4 import BeautifulSoup
# NOTE(review): the host looks like a proxy prefix in front of
# www.python123.io -- confirm this is the intended URL.
# A timeout keeps the script from blocking forever on an unresponsive server.
r = requests.get("https://www.python123.io/ws/demo.html", timeout=30)
# Stop on a 4xx/5xx reply instead of continuing with an error page.
r.raise_for_status()
demo = r.text
print(demo)
print("-----------------------------------------------")
# .contents    子节点的列表,将所有儿子节点存入列表(注意:返回 list)
# .children    子节点的迭代类型,与 .contents 类似,用于循环遍历儿子节点(注意:迭代类型)
# .descendants 子孙节点的迭代类型,包含所有子孙节点,用于循环遍历(注意:迭代类型)
# Downward traversal: parse `demo`, print <head> and the contents of <head>
# and <body>, then walk the children and all descendants of <body>.
soup = BeautifulSoup(demo, "html.parser")
print(soup.head)
print(soup.head.contents)
print(soup.body.contents)
print(len(soup.body.contents))
# Loop bodies must be indented; written flush-left they are an IndentationError.
for child in soup.body.contents:
    print(child)
for descendant in soup.body.descendants:
    print(descendant)
# .parent  节点的父亲标签
# .parents 节点先辈标签的迭代类型,用于循环遍历先辈节点
# Print the parents of <title> and <html>, then walk every ancestor of soup.a.
print(soup.title.parent)
print(soup.html.parent)
print("________________________________")
# Upward traversal of the tag tree.
soup = BeautifulSoup(demo, "html.parser")
# The loop and branch bodies must be indented; written flush-left they are an
# IndentationError.
for parent in soup.a.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
# 标签树的平行遍历:发生在同一父节点下的各节点间
# 1) .next_sibling       按 HTML 文本顺序的下一个平行节点标签
# 2) .previous_sibling   按 HTML 文本顺序的上一个平行节点标签
# 3) .next_siblings      迭代类型,按 HTML 文本顺序的后续所有平行节点标签
# 4) .previous_siblings  迭代类型,按 HTML 文本顺序的前序所有平行节点标签
# Sibling traversal around soup.a: one and two steps forward, then one and two
# steps backward.
print("-------------------")
print(soup.a.next_sibling)
print(soup.a.next_sibling.next_sibling)
print(soup.a.previous_sibling)
# The attribute is spelled `previous_sibling`.
print(soup.a.previous_sibling.previous_sibling)
# Download the demo page again and print prettify() output for the whole
# document and for soup.a, then repeat on a tiny document with non-ASCII text.
import requests
from bs4 import BeautifulSoup
# NOTE(review): the host looks like a proxy prefix in front of
# www.python123.io -- confirm this is the intended URL.
# A timeout keeps the script from blocking forever on an unresponsive server.
r = requests.get("https://www.python123.io/ws/demo.html", timeout=30)
# Stop on a 4xx/5xx reply instead of prettifying an error page.
r.raise_for_status()
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.prettify())  # prettify() adds line breaks
print(soup.a.prettify())
print("--------------------------------")
soup = BeautifulSoup("<p>中文</p>", "html.parser")
print(soup.p.string)
print(soup.p.prettify())