"""
思路:
1、先打开浏览器,输入关键字,点击搜索,获取商品页总页数
2、通过遍历所有页面,获取商品页
3、获取页面的时候同时进行解析页面内容
4、将获取到的数据,存入mongodb中
技巧:
1、先通过Chrome测试需要的内容,再修改为PhantomJS
2、每次需要模拟操作之前,可以设置等待条件,等待加载完毕再操作
3、通过浏览器自带的路径选择器,可以较快的对网页元素进行选择
"""
# Implementation (代码实现)
import re
import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# Module-level setup: these statements run at import time and create the
# shared handles (db, browser, wait) that the functions in this file use.

# MongoDB client for the host "localhost"; no port is passed here.
client = pymongo.MongoClient("localhost")
# Handle to the database named "taobao".
db = client["taobao"]
# Extra command-line flags handed to the PhantomJS process.
# NOTE(review): presumably turns off image loading to speed up page loads --
# confirm against the PhantomJS command-line documentation.
service_args = ["--load-images=false"]
browser = webdriver.PhantomJS(service_args=service_args)
# Fixed window size of 1400 x 900.
browser.set_window_size(1400, 900)
# Shared explicit wait bound to `browser`, constructed with a timeout of 10.
wait = WebDriverWait(browser, 10)
def search_page():
    """Submit the keyword search and return the number of result pages.

    Loads the start URL, types the keyword "美食" into the ``#q`` search box,
    clicks the search button, then reads the pager's "total" label and
    extracts the first integer from its text.

    A ``TimeoutException`` raised by any of the waits restarts the whole
    attempt from the initial page load. Retrying is done in a loop rather
    than by recursion so that a long run of timeouts cannot exhaust the
    interpreter's recursion limit.

    Returns:
        int: The first run of digits found in the pager's total text.

    Raises:
        AttributeError: If the pager's total text contains no digits.
    """
    while True:
        print("正在搜索...")
        try:
            browser.get("https://www.taobao.com/")
            search = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')
                )
            )
            search.send_keys("美食")
            submit.click()
            # The results page loads after the click, so wait for the pager
            # instead of looking it up immediately; an immediate lookup can
            # raise NoSuchElementException, which the retry does not cover.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")
                )
            )
            # Raw string keeps "\d" a regex escape, not a string escape.
            return int(re.search(r"(\d+)", total.text).group(1))
        except TimeoutException:
            # Start over from the initial page load.
            continue
def next_page(page_num):
    """Jump to result page ``page_num`` through the pager form and parse it.

    Clears the pager's page-number input, types ``page_num``, clicks the
    confirm button, waits until the pager's active item shows that number,
    and then calls ``parse_page()``.

    A ``TimeoutException`` from any wait (including the one inside
    ``parse_page``) repeats the whole attempt. Retrying is done in a loop
    rather than by recursion so that repeated timeouts cannot exhaust the
    interpreter's recursion limit.

    Args:
        page_num: Page number to jump to; it is typed into the pager input
            and compared, as a string, with the active pager item's text.
    """
    while True:
        print("正在翻页...", page_num)
        try:
            number = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            confirm = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            number.clear()
            number.send_keys(page_num)
            confirm.click()
            # Only parse once the pager highlights the requested page.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active"),
                    str(page_num),
                )
            )
            parse_page()
            return
        except TimeoutException:
            # Retry the same page from the top.
            continue
def parse_page():
    """Extract every product on the current results page and store each one.

    Waits for at least one item to be present in the item list, parses the
    browser's page source with pyquery, and for each item builds a dict with
    the keys ``image``, ``title``, ``price``, ``shop``, ``deal-cnt`` and
    ``location``. Each dict is printed and passed to ``save_to_mongo``.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, item_selector))
    )
    page = pq(browser.page_source)
    for entry in page(item_selector).items():
        product = {
            "image": entry.find(".pic .img").attr("src"),
            "title": entry.find(".title").text(),
            "price": entry.find(".price").text(),
            "shop": entry.find(".shop").text(),
            # Drop the last three characters of the deal-count text.
            "deal-cnt": entry.find(".deal-cnt").text()[:-3],
            "location": entry.find(".location").text(),
        }
        print(product)
        save_to_mongo(product)
def save_to_mongo(data):
    """Insert one product document into the ``taobao`` collection.

    Saving is best-effort: any exception raised by the insert is reported
    on stdout together with its message and is not re-raised, so one failed
    write does not stop the caller.

    Args:
        data: The document (a dict) to insert.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db["taobao"].insert_one(data)
    except Exception as exc:
        # Include the error so a failed save can be diagnosed.
        print("保存失败", exc)
    else:
        print("保存成功", data)
def main():
    """Run the crawl: search once, then visit and parse every result page.

    ``search_page()`` supplies the page count; ``next_page(i)`` is called for
    each page from 1 through that count. The browser is shut down in a
    ``finally`` clause so the driver process is released even when the crawl
    raises part-way through.
    """
    try:
        total = search_page()
        for i in range(1, total + 1):
            next_page(i)
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window.
        browser.quit()


# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()