环境说明
python3.5
centos7.2
重要代码解释
使用selenium加载网页:
# Start a PhantomJS-backed WebDriver session; `driver` is reused by the steps below.
driver=webdriver.PhantomJS()
# Load the page that is going to be scraped.
driver.get("https://movie.douban.com/")
使用selenium与网页进行交互，将网页完全加载:
# Keep clicking the element with class "more" until it can no longer be
# found or clicked, so that everything it reveals ends up in the page source.
while True:
    try:
        driver.find_element_by_class_name("more").click()
    except WebDriverException:
        # Covers both "element not found" and "element cannot be clicked";
        # unrelated programming errors are no longer swallowed here.
        print("没有这样的text.")
        break
获取包含电影信息的网页源代码:
# Snapshot the HTML of the page as currently rendered by the browser.
movis = driver.page_source
# quit() instead of close(): close() only closes the current window, whereas
# quit() ends the whole WebDriver session so no browser process is left behind.
driver.quit()
使用xpath解析web代码:
# Parse the captured page source into an lxml element tree.
html = etree.HTML(movis)
# Select every <a> element whose class attribute is exactly "item"
# (presumably one per movie card -- confirm against the live page markup).
titles = html.xpath("//a[@class='item']")
提取需要的内容:
# Pull the poster URL(s), the title and the rating text out of each matched
# <a class="item"> element.
for item in titles:
    url_img = item.xpath("./div/img/@src")
    title_texts = item.xpath("./p/text()")
    rank_movie = item.xpath("./p/strong/text()")
    # Strip all whitespace from the first text node of <p>; fall back to an
    # empty title instead of raising IndexError when <p> has no direct text.
    title_moive = re.sub(r"\s+", "", title_texts[0]) if title_texts else ""
完整代码
import re

from lxml import etree
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
# Scrape the movie cards shown on the target page and print, for each card,
# its poster URL(s), its whitespace-stripped title and its rating text.
driver = webdriver.PhantomJS()
try:
    driver.get("https://movie.douban.com/")
    # Keep clicking the element with class "more" until it can no longer be
    # found or clicked, so everything it reveals ends up in the page source.
    while True:
        try:
            driver.find_element_by_class_name("more").click()
        except WebDriverException:
            # Covers both "element not found" and "element cannot be
            # clicked"; unrelated programming errors are not swallowed.
            print("没有这样的text.")
            break
    movis = driver.page_source
finally:
    # quit() ends the whole WebDriver session (close() would only close the
    # current window); running it in `finally` releases the browser even when
    # loading or clicking fails.
    driver.quit()

print(type(movis))
html = etree.HTML(movis)
titles = html.xpath("//a[@class='item']")

for item in titles:
    url_img = item.xpath("./div/img/@src")
    title_texts = item.xpath("./p/text()")
    rank_movie = item.xpath("./p/strong/text()")
    # Strip all whitespace from the first text node of <p>; fall back to an
    # empty title instead of raising IndexError when <p> has no direct text.
    title_moive = re.sub(r"\s+", "", title_texts[0]) if title_texts else ""
    # NOTE(review): the original indentation was lost; both prints are assumed
    # to run once per movie, the second acting as a separator -- confirm.
    print(url_img, "===", title_moive, "===", rank_movie)
    print("****************************************************************************")