When collecting data from JD (京东), make sure to comply with JD's robots.txt rules and applicable laws and regulations. Below are some legal and compliant ways to obtain JD data:
Method 1: Use the official JD Open Platform API (recommended)
JD provides an open platform API, which is the most compliant approach:
```python
import requests

# JD Open Platform API example (requires an approved app_key/app_secret).
# Note: real jd.union.open.goods.query calls also require a timestamp and a
# signed "sign" parameter; this sketch omits the signing step.
def get_jd_product_info(product_id, app_key, app_secret):
    url = (
        "https://api.jd.com/routerjson"
        "?method=jd.union.open.goods.query"
        f"&app_key={app_key}"
        "&access_token="
        f'&param_json={{"goodsReq":{{"skuIds":"{product_id}"}}}}'
    )
    headers = {"Content-Type": "application/json"}
    response = requests.get(url, headers=headers)
    return response.json()

# Usage (replace with your own app_key and app_secret):
# result = get_jd_product_info("sku_id", "YOUR_APP_KEY", "YOUR_APP_SECRET")
# print(result)
```
Method 2: Parse web pages with requests and BeautifulSoup
```python
import requests
from bs4 import BeautifulSoup
import time
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Referer': 'https://www.jd.com/'
}

def get_jd_product_info(keyword):
    search_url = f"https://search.jd.com/Search?keyword={keyword}"
    try:
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = []
        for item in soup.select('.gl-item'):
            product = {
                'title': item.select_one('.p-name em').get_text(strip=True) if item.select_one('.p-name em') else None,
                'price': item.select_one('.p-price strong i').get_text(strip=True) if item.select_one('.p-price strong i') else None,
                'shop': item.select_one('.p-shop a').get_text(strip=True) if item.select_one('.p-shop a') else None,
                'url': item.select_one('.p-img a')['href'] if item.select_one('.p-img a') else None
            }
            # Result links are protocol-relative; guard against None before prefixing.
            if product['url'] and not product['url'].startswith('http'):
                product['url'] = 'https:' + product['url']
            products.append(product)
        # Throttle requests to avoid putting pressure on JD's servers.
        time.sleep(random.uniform(1, 3))
        return products
    except Exception as e:
        print(f"Error: {e}")
        return None

# Usage:
# products = get_jd_product_info("手机")
# for product in products:
#     print(product)
```
Method 3: Simulate a browser with Selenium
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def get_jd_data_with_selenium(keyword):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(f"https://search.jd.com/Search?keyword={keyword}")
        # Wait for the result list to load.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".gl-item"))
        )
        # Scroll the page to trigger lazy loading of more results.
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
        products = []
        items = driver.find_elements(By.CSS_SELECTOR, ".gl-item")
        for item in items:
            try:
                product = {
                    'title': item.find_element(By.CSS_SELECTOR, ".p-name em").text,
                    'price': item.find_element(By.CSS_SELECTOR, ".p-price strong i").text,
                    'shop': item.find_element(By.CSS_SELECTOR, ".p-shop a").text
                            if item.find_elements(By.CSS_SELECTOR, ".p-shop a") else None,
                    'url': item.find_element(By.CSS_SELECTOR, ".p-img a").get_attribute("href")
                }
                products.append(product)
            except Exception as e:
                print(f"Error parsing item: {e}")
                continue
        return products
    finally:
        driver.quit()

# Usage:
# products = get_jd_data_with_selenium("笔记本电脑")
# for product in products:
#     print(product)
```
Notes
- Follow robots.txt: JD's robots.txt specifies which pages may and may not be crawled
- Throttle your requests: avoid high-frequency requests; add delays (e.g. time.sleep)
- User agent: set a reasonable User-Agent header
- Anti-scraping measures: JD employs anti-scraping mechanisms and may block your IP
- Commercial use: for commercial purposes, always use the official JD API
- Data handling: comply with applicable laws and regulations and do not violate user privacy
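The rate-control advice above can be sketched as a small helper that spaces requests out with a random delay (the function names here are hypothetical, not part of any JD SDK; the session object is whatever HTTP client you are using, e.g. a `requests.Session`):

```python
import random
import time

def next_delay(min_s=1.0, max_s=3.0):
    """Return a random delay in [min_s, max_s] seconds."""
    return random.uniform(min_s, max_s)

def throttled_get(session, url, min_s=1.0, max_s=3.0, **kwargs):
    """Sleep a random interval, then issue the GET through the given session.

    Randomizing the gap between requests avoids a fixed, machine-like
    request rhythm and keeps the load on the server low.
    """
    time.sleep(next_delay(min_s, max_s))
    return session.get(url, **kwargs)
```

Calling `throttled_get(session, search_url, headers=headers)` instead of `session.get(...)` directly ensures every request is preceded by a polite pause.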
More advanced options
For large-scale collection, consider:
- A rotating proxy IP pool
- A distributed crawling framework such as Scrapy
- OCR techniques for handling CAPTCHAs
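A proxy IP pool can be as simple as cycling through a list of proxy addresses and handing the next one to each request. A minimal sketch (the proxy addresses below are placeholders, not real servers):

```python
import itertools

# Placeholder proxy addresses; substitute your own pool.
PROXIES = [
    "http://10.0.0.1:8080",
    "http://10.0.0.2:8080",
    "http://10.0.0.3:8080",
]

_pool = itertools.cycle(PROXIES)

def next_proxies():
    """Return a requests-style proxies dict using the next proxy in the pool."""
    proxy = next(_pool)
    return {"http": proxy, "https": proxy}

# Usage with requests:
# response = requests.get(url, headers=headers, proxies=next_proxies())
```

A production pool would also health-check proxies and drop ones that get blocked; frameworks like Scrapy support this through downloader middleware.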
Always remember to respect the site's terms of service and applicable laws and regulations when collecting data.