Crawling the 2019 administrative division data of the National Bureau of Statistics with Python


Create the data table:
Create the tab_citys table in MySQL.


DROP TABLE IF EXISTS `tab_citys`;
CREATE TABLE `tab_citys` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `parent_id` int(11) DEFAULT NULL,
  `city_name_zh` varchar(20) NOT NULL,
  `city_name_en` varchar(20) DEFAULT NULL,
  `city_level` int(11) NOT NULL,
  `city_code` char(12) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=742037 DEFAULT CHARSET=utf8;

Python script:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#   Purpose: fetch province / city / county data
#   Version: v1.1
import os

import pymysql
import requests
import lxml.etree as etree


class chinese_city():
    # initializer
    def __init__(self):
        self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
        self.conn = pymysql.connect(host="localhost", port=8889, user="root", passwd="root", db="test", charset='utf8')  # adjust connection settings to your environment
        self.cur = self.conn.cursor()
        # XPath row selectors by level: 1 province, 2 city, 3 county, 4 town, 5 village
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]'
        }
    def __del__(self):
        if self.cur:
            self.cur.close()
        if self.conn:
            self.conn.close()

    def crawl_page(self,url):
        ''' Crawl one page of the administrative division code site '''
        # print(f"crawling...{url}")
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
                   'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
        i = 0
        while i < 3:
            try:
                html = requests.get(url, headers=headers, timeout=20)
                html.encoding = 'gbk'  # the stats.gov.cn pages are GBK-encoded
                # print(html.status_code)
                text = html.text
                return text
            except requests.exceptions.RequestException:
                i += 1
                print('timeout ' + url)

    # parse the province index page, return a list of provinces
    def parseProvince(self):
        html = self.crawl_page(self.baseUrl)
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        nodes = tree.xpath('//tr[@class="provincetr"]')
        id = 1
        values = []
        for node in nodes:
            items = node.xpath('./td')
            for item in items:
                value = {}
                nexturl = item.xpath('./a/@href')
                province = item.xpath('./a/text()')
                print(province)
                value['url'] = self.base + "".join(nexturl)
                value['name'] = "".join(province)
                value['code'] = 0
                value['pid'] = 0
                value['id'] = id
                value['level'] = 1
                print(repr(value['name']))
                id = id + 1
                last_id = self.insert_to_db(value)
                value['id'] = last_id
                values.append(value)
                print(value)
        return values

    # parse a child page for the level given by trid
    def parse(self,trid, pid, url):
        if url.strip() == '':
            return None
        # url_prefix+url
        html = self.crawl_page(url)
        if html is None:  # all retries timed out, skip this branch
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        
        # some prefecture-level cities have no county level and list towns directly
        if trid == 3:
            nodes = tree.xpath(self.trdic.get(trid))
            if len(nodes) == 0:
                nodes = tree.xpath(self.trdic.get(4))
                print('city with towns directly under it: ' + url)
        else:
            nodes = tree.xpath(self.trdic.get(trid))


        path = os.path.basename(url)
        base_url = url.replace(path, '')
        id = 1
        values = []
        # one row per region on this page
        for node in nodes:
            value = {}
            nexturl = node.xpath('./td[1]/a/@href')
            if len(nexturl) == 0:
                nexturl = ''
            code = node.xpath('./td[1]/a/text()')
            if len(code) == 0:
                code = node.xpath('./td[1]/text()')
            name = node.xpath('./td[2]/a/text()')
            if len(name) == 0:
                name = node.xpath('./td[2]/text()')
            value['code'] = "".join(code)
            urltemp = "".join(nexturl)
            if len(urltemp) != 0:
                value['url'] = base_url + "".join(nexturl)
            else:
                value['url'] = ''
            value['name'] = "".join(name)
            print(repr(value['name']))
            print(value['url'])
            value['id'] = id
            value['pid'] = pid
            value['level'] = trid
            id = id + 1
            last_id = self.insert_to_db(value)
            value['id'] = last_id
            values.append(value)
            print(value)
        return values

    # parse the village (neighborhood committee) page
    def parseVillager(self,trid, pid, url):
        html = self.crawl_page(url)
        if html is None:  # all retries timed out, skip this page
            return []
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        nodes = tree.xpath(self.trdic.get(trid))
        id = 1
        values = []
        # one row per village on this page
        for node in nodes:
            value = {}
            nexturl = node.xpath('./td[1]/a/@href')
            code = node.xpath('./td[1]/text()')
            vcode = node.xpath('./td[2]/text()')  # urban-rural classification code (not stored)
            name = node.xpath('./td[3]/text()')
            value['code'] = "".join(code)
            value['url'] = "".join(nexturl)
            value['name'] = "".join(name)
            print(repr(value['name']))
            value['id'] = id
            value['pid'] = pid
            value['level'] = trid
            id = id + 1
            last_id = self.insert_to_db(value)
            value['id'] = last_id
            values.append(value)  # append once, after the DB id is known
            print(value)

        return values

    # insert one region into the database, return the auto-increment id
    def insert_to_db(self, region):
        # return 0
        param = []
        lastid = 0
        try:
            sql = 'INSERT INTO tab_citys values(%s,%s,%s,%s,%s, %s)'
            param = (0, region.get("pid"), region.get("name"), '', region.get("level"), region.get("code"))
            self.cur.execute(sql, param)
            lastid = self.cur.lastrowid
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return lastid

    # run the full crawl, from provinces down to villages
    def parseChineseCity(self):
        values = self.parseProvince()
        for value in values:
            citys = self.parse(2, value['id'], value['url'])
            if citys is not None:
                for city in citys:
                    countys = self.parse(3, city['id'], city['url'])
                    # the block below fetches town and village data; delete it if you don't need it
                    if countys is not None:
                        for county in countys:
                            towns = self.parse(4, county['id'], county['url'])
                            if towns is not None:
                                for town in towns:
                                    villagers = self.parseVillager(5, town['id'], town['url'])

if __name__ == '__main__':
    crawler = chinese_city()
    crawler.parseChineseCity()
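
Once the crawl finishes, the hierarchy can be read back out of tab_citys. Below is a minimal verification sketch, assuming the same connection parameters used by the crawler above (adjust host/port/user/password to your environment); it only relies on the columns defined in the CREATE TABLE statement:

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Sanity check: count rows per administrative level and list the provinces.
import pymysql

conn = pymysql.connect(host="localhost", port=8889, user="root", passwd="root", db="test", charset='utf8')
cur = conn.cursor()

# city_level follows the crawler's trdic mapping: 1 province, 2 city, 3 county, 4 town, 5 village
cur.execute("SELECT city_level, COUNT(*) FROM tab_citys GROUP BY city_level ORDER BY city_level")
for level, count in cur.fetchall():
    print('level %s: %s rows' % (level, count))

# provinces are inserted with parent_id = 0 and city_level = 1
cur.execute("SELECT id, city_name_zh FROM tab_citys WHERE city_level = 1 ORDER BY id")
for pid, name in cur.fetchall():
    print(pid, name)

cur.close()
conn.close()
```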

 

Crawling population data from the National Bureau of Statistics with Python usually follows the same web-scraping approach: use the requests library to fetch the page content, then parse the HTML with BeautifulSoup or lxml and extract the data you need. A simplified workflow:

1. **Install the required libraries**: make sure `requests`, `beautifulsoup4` and possibly `lxml` are installed; if not, run:
   ```
   pip install requests beautifulsoup4 lxml
   ```
2. **Find the target URL**: the bureau's official site may offer an API for the data; if not, locate the page that contains the population figures.
3. **Send the HTTP request**: use `requests.get()` to fetch the page source:
   ```python
   import requests

   url = "https://example.gov/statistics"  # replace with the URL you found
   response = requests.get(url)
   ```
4. **Parse the HTML**: parse the response with BeautifulSoup:
   ```python
   from bs4 import BeautifulSoup

   soup = BeautifulSoup(response.text, 'lxml')
   population_data = soup.find_all('div', class_='population-data')  # example: look up a specific CSS class
   ```
5. **Extract the data**: depending on the page structure, pull out the numbers, tables or embedded JSON:
   ```python
   data_elements = [element.text for element in population_data]
   total_population = data_elements[0]  # assuming total population is the first element
   ```
6. **Save the data**: write the extracted values to a file or a database (a minimal CSV sketch follows after this list).

Note that before crawling you should check the site's robots.txt and whether scraping is allowed, and comply with the relevant rules. Frequent requests may also get your IP blocked, so adding a delay between requests or using proxy IPs is usually recommended.
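
Step 6 above has no code in the original outline; here is a minimal sketch of saving the extracted values to a CSV file. It assumes the data_elements list produced in step 5 (the placeholder values below are purely illustrative):

```python
import csv

# data_elements would normally come from step 5; placeholder strings here
data_elements = ["total population: ...", "urban population: ...", "rural population: ..."]

with open("population.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["index", "value"])  # hypothetical header
    for i, value in enumerate(data_elements):
        writer.writerow([i, value])
```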