As a guest on the National Science and Technology Library of China (国家科技图书文献中心, NSTL), paging to the fifth page of search results forces a login; clicking page 5 directly also pops up the login dialog, and the URL never changes after paging, so there is nothing to copy. This post works around that by switching the sort order to collect key information about the papers, using Firefox's geckodriver with Selenium.
Approach
Open the page: NSTL degree papers (nstl学位论文).
After paging, the URL at the top stays the same, and a guest can turn at most 4 pages: clicking page 5 directly, or clicking page 4 and then "next page", both pop up the login dialog. I want key information for more than 40 papers without logging in. Notice the sort control in the upper right corner; it is set to the default order, and clicking it lets us switch to a different sort. Under a different sort order the first four pages contain largely different papers, so two passes (default sort and time sort) yield more than 40 unique URLs without ever paging past page 4.
The result list then changes. Pressing F12 and inspecting the page, a pattern emerges in every paper's URL:
each one is a fixed prefix followed by a different id:
https://www.nstl.gov.cn/paper_detail.html?id=<paper id>
After collecting enough of these URLs, Selenium visits each one in turn and extracts the following fields:
title, author, institution, university, major, degree, degree-awarding institution, supervisor, language, submission date, defence date, classification number, keywords, and abstract.
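As a quick illustration (the id below is made up, not a real NSTL id), a detail-page URL is just this prefix plus the data-id attribute taken from each result title:

# Minimal sketch: build a detail-page URL from a paper's data-id.
# "EXAMPLE_ID" is a placeholder; real ids come from the data-id attribute
# of the "p.result-list-tit" elements on the results page.
BASE_URL = "https://www.nstl.gov.cn/paper_detail.html?id={}"

article_id = "EXAMPLE_ID"
print(BASE_URL.format(article_id))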
Scraping the URLs under the default sort order
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd  # pandas for writing the Excel file


def fetch_papers_info(pages: int):
    # Initialise the WebDriver.
    # Note: executable_path works with Selenium 3; Selenium 4+ expects
    # webdriver.Firefox(service=Service(driver_path)) instead.
    driver_path = r'D:\Users\Admin\Downloads\geckodriver.exe'
    driver = webdriver.Firefox(executable_path=driver_path)

    # Open the search page
    driver.get("https://www.nstl.gov.cn/resources_search.html?t=DegreePaper&q=5Lit5Zu956eR5a2m6Zmi5aSn5a2m")
    # Wait for the page to load
    time.sleep(5)

    # Click the "中国科学院" facet
    try:
        sen_button = driver.find_element(By.XPATH,
            '/html/body/div[4]/div/div[1]/ul[1]/li/ul/li[1]/a')
        sen_button.click()
        print("已点击中国科学院论文按钮")
    except Exception as e:
        print("点击按钮时发生错误:", e)

    # Wait for the filtered results to load
    time.sleep(5)

    # Click the "2022年" facet
    try:
        year_2022_button = driver.find_element(By.XPATH,
            '/html/body/div[4]/div/div[1]/ul[1]/li/ul/li[2]/a')
        year_2022_button.click()
        print("已点击2022年论文按钮")
    except Exception as e:
        print("点击按钮时发生错误:", e)

    # Wait for the filtered results to load
    time.sleep(5)

    # Collect detail-page URLs
    papers_info = []
    for page in range(pages):
        # Every result title carries the paper id in its data-id attribute
        articles = driver.find_elements(By.CSS_SELECTOR, "p.result-list-tit")
        for article in articles:
            article_id = article.get_attribute("data-id")
            full_url = f"https://www.nstl.gov.cn/paper_detail.html?id={article_id}"
            papers_info.append(full_url)  # store the URL only

        # Go to the next page
        try:
            next_page_button = driver.find_element(By.CSS_SELECTOR, "a.layui-laypage-next")
            next_page_button.click()
            time.sleep(3)  # wait for the new page to load
        except Exception as e:
            print("翻页时发生错误:", e)
            break

    # Close the WebDriver
    driver.quit()

    # Save the results to an Excel file
    save_to_excel(papers_info)


def save_to_excel(data):
    # Build a DataFrame with a single "urls" column
    df = pd.DataFrame(data, columns=["urls"])
    # Write it to Excel
    output_file = 'url_first.xlsx'
    df.to_excel(output_file, index=False)
    print(f"结果已保存到 {output_file}")


# Entry point
if __name__ == "__main__":
    pages_to_fetch = 4  # number of result pages to scrape
    fetch_papers_info(pages_to_fetch)
A caveat for this page: after you click a facet such as "2022年", the XPath of the other facet links changes, so re-check the paths after every click. Also, replace the driver path with the one for your own browser.
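Because those absolute XPaths break whenever the facet panel re-renders, an alternative (sketched below, not part of the original script) is to locate facet links by their visible text. The label strings are assumptions about what the links display; adjust them to whatever the live page actually shows.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def click_facet(driver, label, timeout=10):
    # Click the first sidebar link whose visible text contains `label`.
    link = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, f"//ul/li/a[contains(., '{label}')]"))
    )
    link.click()

# Hypothetical usage, assuming the facet labels contain these strings:
# click_facet(driver, "中国科学院")
# click_facet(driver, "2022")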
Scraping the URLs under the time sort order
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd  # pandas for writing the Excel file


def fetch_papers_info(pages: int):
    # Initialise the WebDriver (see the Selenium 4 note in the first script)
    driver_path = r'D:\Users\Admin\Downloads\geckodriver.exe'
    driver = webdriver.Firefox(executable_path=driver_path)

    # Open the search page
    driver.get("https://www.nstl.gov.cn/resources_search.html?t=DegreePaper&q=5Lit5Zu956eR5a2m6Zmi5aSn5a2m")
    # Wait for the page to load
    time.sleep(5)

    # Click the "中国科学院" facet
    try:
        sen_button = driver.find_element(By.XPATH,
            '/html/body/div[4]/div/div[1]/ul[1]/li/ul/li[1]/a')
        sen_button.click()
        print("已点击中国科学院论文按钮")
    except Exception as e:
        print("点击按钮时发生错误:", e)

    # Wait for the filtered results to load
    time.sleep(5)

    # Click the "2022年" facet
    try:
        year_2022_button = driver.find_element(By.XPATH,
            '/html/body/div[4]/div/div[1]/ul[1]/li/ul/li[2]/a')
        year_2022_button.click()
        print("已点击2022年论文按钮")
    except Exception as e:
        print("点击按钮时发生错误:", e)

    # Wait for the filtered results to load
    time.sleep(5)

    # Open the sort selector (currently showing "默认排序")
    try:
        default_sort_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
            (By.XPATH, "/html/body/div[4]/div/div[2]/div[2]/div[4]/div[2]/div/div[1]/div/div/div[1]/span")))
        default_sort_button.click()  # open the sort options
        time.sleep(1)  # give the dropdown time to render
    except Exception as e:
        print("点击默认排序按钮时发生错误:", e)

    time.sleep(5)

    # Switch the results to time-based sorting
    try:
        # Find the "时间排序" option and click it
        time_asc_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "/html/body/div[4]/div/div[2]/div[2]/div[4]/div[2]/div/div[1]/div/div/div[2]/div/a[4]"))
        )
        time_asc_button.click()
        # Wait for the selected-value label to confirm the sort actually changed
        new_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.selectValNext"))
        )
        print("新元素已成功找到:", new_element.text)
    except Exception as e:
        print("操作失败:", e)

    time.sleep(5)

    # Collect detail-page URLs
    papers_info = []
    for page in range(pages):
        # Every result title carries the paper id in its data-id attribute
        articles = driver.find_elements(By.CSS_SELECTOR, "p.result-list-tit")
        for article in articles:
            article_id = article.get_attribute("data-id")
            full_url = f"https://www.nstl.gov.cn/paper_detail.html?id={article_id}"
            papers_info.append(full_url)  # store the URL only

        # Go to the next page
        try:
            next_page_button = driver.find_element(By.CSS_SELECTOR, "a.layui-laypage-next")
            next_page_button.click()
            time.sleep(3)  # wait for the new page to load
        except Exception as e:
            print("翻页时发生错误:", e)
            break

    # Close the WebDriver
    driver.quit()

    # Save the results to an Excel file
    save_to_excel(papers_info)


def save_to_excel(data):
    # Build a DataFrame with a single "urls" column
    df = pd.DataFrame(data, columns=["urls"])
    # Write it to Excel
    output_file = 'url_second.xlsx'
    df.to_excel(output_file, index=False)
    print(f"结果已保存到 {output_file}")


# Entry point
if __name__ == "__main__":
    pages_to_fetch = 2  # number of result pages to scrape
    fetch_papers_info(pages_to_fetch)
Merging the two Excel files and removing duplicate URLs
import pandas as pd

# Read the two URL lists produced by the scrapers above
url1 = pd.read_excel('url_first.xlsx')
url2 = pd.read_excel('url_second.xlsx')

# Both files store their URLs in a column named 'urls';
# adjust the column name below if yours differs
urls1 = set(url1['urls'])  # URLs from the default-sort run
urls2 = set(url2['urls'])  # URLs from the time-sort run

# Merge and deduplicate
combined_urls = urls1.union(urls2)

# Print the merged result
print(f"合并后的 URL 数量: {len(combined_urls)}")
print("合并后的 URL 列表:")
print(combined_urls)

# Optional: save the merged URLs to Excel
# (sorted() turns the set into a list with a stable order for the DataFrame)
combined_df = pd.DataFrame(sorted(combined_urls), columns=['url'])
combined_df.to_excel('combined_urls.xlsx', index=False)
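An equivalent way to merge, sketched here as an alternative rather than a replacement, is to let pandas do the deduplication directly. It assumes both files have the 'urls' column that the two scrapers write.

import pandas as pd

# Alternative sketch: concatenate the two sheets and drop duplicate URLs.
df = pd.concat([pd.read_excel('url_first.xlsx'), pd.read_excel('url_second.xlsx')],
               ignore_index=True)
df = df.drop_duplicates(subset='urls').reset_index(drop=True)

# Rename the column to 'url' so it matches what the detail scraper expects.
df.rename(columns={'urls': 'url'}).to_excel('combined_urls.xlsx', index=False)
print(f"Merged URL count: {len(df)}")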
Scraping each paper's key information from its URL
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC


# Extract the institution(s)
def extract_authority(driver):
    authority = []
    # Up to three institution links may be present
    for i in range(1, 4):
        try:
            # Build the XPath for the i-th institution link
            xpath = f"/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[2]/div/div/span[{i}]/a"
            institution_element = driver.find_element(By.XPATH, xpath)
            authority.append(institution_element.text.strip())
        except Exception:
            # The element is missing, so there are no more institutions
            break
    # Join with '|'
    return '|'.join(authority)


# Extract the keywords
def extract_keyword(driver):
    keywords = []
    # Up to five keyword links may be present
    for i in range(1, 6):
        try:
            # Try the first possible XPath
            xpath1 = f"/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[11]/div/span[{i}]/a"
            keyword_element = driver.find_element(By.XPATH, xpath1)
            keywords.append(keyword_element.text.strip())
        except Exception:
            try:
                # If the first XPath fails, try the second one
                xpath2 = f"/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[12]/div/span[{i}]/a"
                keyword_element = driver.find_element(By.XPATH, xpath2)
                keywords.append(keyword_element.text.strip())
            except Exception:
                # Neither XPath matched, so there are no more keywords
                break
    # Join with '|'
    return '|'.join(keywords)


# Expand and extract the abstract
def extract_expanded_content(driver):
    try:
        # Wait until the expand button is clickable, then click it
        expand_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'span.btn_open_up'))
        )
        expand_button.click()
        # Wait for the expanded text to become visible
        try:
            # Try the first possible XPath
            expanded_content = WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[12]/div[1]/p/i'))
            )
        except Exception:
            # If the first XPath fails, try the second one
            expanded_content = WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[13]/div[1]/p/i'))
            )
        # Return the expanded abstract text, stripped of surrounding whitespace
        return expanded_content.text.strip()
    except Exception as e:
        print(f"出现错误: {e}")
        return None  # return None on failure


# Extract all required fields via XPath
def extract_paper_info(driver):
    paper_info = {}
    paper_info['论文标题'] = driver.find_element(By.XPATH, "//*[@id='title']").text.strip()
    # Author
    paper_info['作者'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[1]/div/div/span/a[1]").text.strip()
    # Institution(s)
    paper_info['机构'] = extract_authority(driver)
    # University
    paper_info['院校'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[3]/div/span/a").text.strip()
    # Major
    paper_info['专业'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[4]/div/span/a").text.strip()
    # Degree
    paper_info['学位'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[5]/div/span/a").text.strip()
    # Degree-awarding institution
    paper_info['授予机构'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[6]/div/span/a").text.strip()
    # Supervisor
    paper_info['导师'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[7]/div/div/span/a[1]").text.strip()
    # Language
    paper_info['语种'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[8]/div/span/i").text.strip()
    # Submission date
    paper_info['提交日期'] = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[9]/div/span/i").text.strip()
    # Defence date (missing on some pages, in which case this slot holds the classification number)
    date_text = driver.find_element(By.XPATH,
        "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[10]/div/span/i").text.strip()
    # If the text contains Chinese or Latin letters it is not a date
    if re.search(r'[\u4e00-\u9fff]', date_text) or re.search(r'[a-zA-Z]', date_text):
        paper_info['论文答辩日期'] = None
    else:
        paper_info['论文答辩日期'] = date_text
    # Classification number
    try:
        classification_number = driver.find_element(By.XPATH,
            "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[10]/div/span/i").text.strip()
    except NoSuchElementException:
        classification_number = None
    # If that slot actually held the defence date, read the next slot instead
    if classification_number == paper_info['论文答辩日期']:
        try:
            classification_number = driver.find_element(By.XPATH,
                "/html/body/div[6]/div[1]/div[1]/div[1]/div/div[1]/div[11]/div/span/i").text.strip()
        except NoSuchElementException:
            classification_number = None
    paper_info['分类号'] = classification_number
    # Keywords
    paper_info['关键词'] = extract_keyword(driver)
    # Abstract
    paper_info['摘要'] = extract_expanded_content(driver)
    return paper_info


# Main driver routine
def main(urls):
    # Initialise the WebDriver (GeckoDriver path)
    driver_path = r'D:\Users\Admin\Downloads\geckodriver.exe'
    driver = webdriver.Firefox(executable_path=driver_path)
    all_paper_info = []
    for url in urls:
        driver.get(url)
        time.sleep(5)  # wait for the page to load
        # Extract this paper's fields
        paper_info = extract_paper_info(driver)
        if paper_info:
            all_paper_info.append(paper_info)
        else:
            all_paper_info.append({'论文答辩日期': None})  # placeholder if nothing was extracted
    # Close the WebDriver
    driver.quit()
    # Save the results to Excel
    df = pd.DataFrame(all_paper_info)
    df.to_excel('中国科学院大学论文信息的.xlsx', index=False)


# Entry point
if __name__ == "__main__":
    # urls_df = pd.read_excel('combined_urls.xlsx')
    # urls = urls_df['url'].tolist()
    urls_df = pd.read_excel('url_second.xlsx')
    # Take only the first 20 URLs to avoid tripping the site's anti-crawling checks
    urls = urls_df['urls'].head(20).tolist()
    main(urls)
Luckily these are degree papers; for group research outputs with several authors, many of the later fields simply aren't there. Oddly, the classification number can sit in one of two positions (those are the only two I found). Scraping too many detail pages in one go also seems to trigger the site's anti-crawling measures, which is why the script above uses head() near the end to fetch only a small batch at a time.
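One way to take that further is to visit the URLs in small batches with a pause between batches. This is only a sketch; the batch size and sleep length are arbitrary guesses, not tuned values.

import time
import pandas as pd

# Sketch: throttle the detail scraper by processing URLs in small batches.
# Reuses main() from the script above; note that main() rewrites its output
# file on every call, so change the output name per batch (or accumulate
# results) before using this for real.
urls = pd.read_excel('url_second.xlsx')['urls'].tolist()
batch_size = 10  # arbitrary

for start in range(0, len(urls), batch_size):
    batch = urls[start:start + batch_size]
    main(batch)      # each call opens and closes its own Firefox instance
    time.sleep(60)   # cool-down between batches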