Openly shared datasets are hard to find in China, and the ones that do exist usually charge in some disguised way or are used to fish for followers. The full five-level dataset of provinces, prefecture-level cities, counties/districts, towns/streets, and villages can in fact be found, but for my own purposes I still implemented it with a crawler: partly to make a point against those who charge for it, partly to exercise my programming skills, and partly for a book I am writing.
First, be clear about your goal before looking for a dataset.
Second, try to find a ready-made dataset; only go looking for a raw data source if none exists.
Third, once you have a data source, analyze it:
1. What we want is a dataset, i.e. structured data, which is slightly different from plain crawling.
2. Analyze the data source's entry point and its URL rules.
3. Analyze the page structure to extract the data we want and to work out how to construct the next round of requests.
Level 1: provinces, autonomous regions and municipalities, with name and URL link, e.g. 13.html → http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{url}
Level 2: prefecture-level cities, with division code, name and URL link, e.g. 13/1301.html → http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{url}
Level 3: counties, districts, industrial parks etc., with division code, name and URL link, e.g. 01/130102.html → http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{provincecode}/{url}
Level 4: streets and towns, with division code, name and URL link, e.g. 02/130102001.html → http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{provincecode}/{districtcode}/{url}
Level 5: neighborhood committees or village committees, with division code, urban-rural classification code and name (no further link).
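To make these URL rules concrete, here is a minimal sketch of how the links compose across the levels, assuming the relative hrefs listed above; the variable names are illustrative only and are not part of the crawler itself.
# Minimal sketch of the URL rules above; variable names are illustrative only.
BASE = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022'

# Level 1: the index page links straight to '13.html'
province_url = '{}/{}'.format(BASE, '13.html')
# Level 2: a province page links to '13/1301.html'
city_url = '{}/{}'.format(BASE, '13/1301.html')
# Level 3: a city page links to '01/130102.html', relative to the province directory
county_url = '{}/{}/{}'.format(BASE, '13', '01/130102.html')
# Level 4: a county page links to '02/130102001.html', relative to province/county
town_url = '{}/{}/{}/{}'.format(BASE, '13', '01', '02/130102001.html')

print(town_url)
# -> http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/13/01/02/130102001.html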
Once this structure is clear, the implementation is not difficult.
The hard part of crawling is the constant trial and error: you cannot test every case, and you cannot predict what the remote side might do in response to your crawler.

Step 1: define the crawler's retry mechanism
This code was adapted from elsewhere, with a few small changes so that the retry interval and the output are more intuitive.
from lxml import etree
import requests
import time
import csv


def retry(times):
    def wrapper(func):
        def inner_wrapper(*args, **kwargs):
            i = 0
            while i < times:
                try:
                    return func(*args, **kwargs)
                except Exception:
                    # log here; func.__name__ is the name of the failing function
                    print("log debug: {}()".format(func.__name__))
                    # increment the failure count
                    i += 1
                    # adjust the sleep time according to the number of failures
                    time.sleep(3 * i)
                    # report which retry attempt this is
                    print('crawler retry attempt {}'.format(i))
        return inner_wrapper
    return wrapper
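As a quick sanity check of the decorator, here is a hypothetical usage sketch; fetch_page and its error handling are my own example (relying on the requests import above), not part of the crawler below.
# Hypothetical usage of the retry decorator defined above (not part of the crawler itself).
@retry(3)
def fetch_page(url):
    # any exception raised here (timeout, HTTP error, ...) triggers a retry
    res = requests.get(url, timeout=3)
    res.raise_for_status()
    return res.content.decode('utf-8', 'ignore')

# A failing call is retried up to 3 times, backing off 3 s, 6 s, 9 s after each failure,
# and returns None if every attempt fails.
html = fetch_page('http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/index.html')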
Step 2: start crawling from the root node to get each province's name and link
from lxml import etree
import requests
import time

# Level 1: provinces, autonomous regions and municipalities - name and URL link
# Level 2: prefecture-level cities - division code, name and URL link
# Level 3: counties, districts, industrial parks etc. - division code, name and URL link
# Level 4: streets and towns - division code, name and URL link
# Level 5: neighborhood or village committees - division code, urban-rural classification code and name
url_level0 = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/index.html'
base_url_level = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{}'
base_url_level2 = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{}/{}'
base_url_level3 = 'http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/{}/{}/{}'


@retry(5)
def getlevelroot(url):
    print('processing root page {}--------------------'.format(url))
    provincetres = requests.get(url=url, headers=headers, verify=False, timeout=3)
    provincehtml = provincetres.content.decode("utf-8", "ignore")
    provincetree = etree.HTML(provincehtml)
    provincetrs = provincetree.xpath('//tr[@class="provincetr"]')
    for provincetr in provincetrs:
        provincetd = provincetr.xpath('td')
        for province in provincetd:
            provinceurl = base_url_level.format(province.xpath('a/@href')[0])
            provinceurlcode = province.xpath('a/@href')[0].split('.')[0]
            provincename = province.xpath('a/text()')[0]
            print('current province = ', provincename, provinceurl)
            with open(r'省份.csv', 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                # pad the 2-digit province code to the full 12-digit division code
                writer.writerow([provinceurlcode.ljust(12, '0'), provincename])
            getlevel1(provinceurl, provinceurlcode)
Step 3: crawl each province link to get the prefecture-level cities' information and links
@retry(5)
def getlevel1(url, provinceurlcode):
    print('processing province page {}--------------------'.format(url))
    citytres = requests.get(url=url, headers=headers, verify=False, timeout=3)
    cityhtml = citytres.content.decode("utf-8", "ignore")
    citytree = etree.HTML(cityhtml)
    citytrs = citytree.xpath('//tr[@class="citytr"]')
    for city in citytrs:
        cityurl = base_url_level.format(city.xpath('td/a/@href')[0])
        citycode = city.xpath('td/a/text()')[0]
        cityname = city.xpath('td/a/text()')[1]
        print('current prefecture-level city = ', cityurl, citycode, cityname)
        with open(r'地级市.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([citycode, cityname])
        getlevel2(cityurl, provinceurlcode)
Step 4: crawl each prefecture-level city link to get the counties'/districts' information and links
@retry(5)
def getlevel2(url='', provinceurlcode='13'):
    # url='http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/13/1301.html'
    print('processing city page {}--------------------'.format(url))
    countytres = requests.get(url=url, headers=headers, verify=False, timeout=3)
    countyhtml = countytres.content.decode("utf-8", "ignore")
    countytree = etree.HTML(countyhtml)
    countytrs = countytree.xpath('//tr[@class="countytr"]')
    for county in countytrs:
        try:
            newurl = county.xpath('td/a/@href')[0]
            countyurl = base_url_level2.format(provinceurlcode, newurl)
            countyurlcode = newurl.split('/')[0]
            countycode = county.xpath('td/a/text()')[0]
            countyname = county.xpath('td/a/text()')[1]
            print('current county/district = ', countyurl, countycode, countyname)
            with open(r'县区.csv', 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([countycode, countyname])
            getlevel3(countyurl, provinceurlcode, countyurlcode)
        except IndexError:
            # some rows have no <a> link (no deeper page), so only print them
            countycode = county.xpath('td/text()')[0]
            countyname = county.xpath('td/text()')[1]
            print(countycode, countyname)
Step 5: crawl each county/district link to get the towns'/streets' information and links
@retry(5)
def getlevel3(url='', provinceurlcode='13', countyurlcode='01'):
    # url='http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/13/01/130102.html'
    print('processing county page {}--------------------'.format(url))
    towntres = requests.get(url=url, headers=headers, verify=False, timeout=3)
    townhtml = towntres.content.decode("utf-8", "ignore")
    towntree = etree.HTML(townhtml)
    towntrs = towntree.xpath('//tr[@class="towntr"]')
    for town in towntrs:
        try:
            newurl = town.xpath('td/a/@href')[0]
            townurl = base_url_level3.format(provinceurlcode, countyurlcode, newurl)
            towncode = town.xpath('td/a/text()')[0]
            townname = town.xpath('td/a/text()')[1]
            print('current town/street = ', newurl, townurl, towncode, townname)
            with open(r'镇街.csv', 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([towncode, townname])
            getlevel4(townurl)
        except IndexError:
            # rows without an <a> link have no deeper page to follow
            towncode = town.xpath('td/text()')[0]
            townname = town.xpath('td/text()')[1]
            print(towncode, townname)
Step 6: crawl each town/street link to get the community (village-level) information
@retry(5)
def getlevel4(url=''):
    # url='http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/13/01/02/130102001.html'
    # running counter of how many village-level records have been collected
    global a
    print('processing town page {}--------------------'.format(url))
    villagetres = requests.get(url=url, headers=headers, verify=False, timeout=3)
    villagehtml = villagetres.content.decode("utf-8", "ignore")
    villagetree = etree.HTML(villagehtml)
    villagetrs = villagetree.xpath('//tr[@class="villagetr"]')
    for village in villagetrs:
        villagecode = village.xpath('td/text()')[0]
        villagetype = village.xpath('td/text()')[1]
        villagename = village.xpath('td/text()')[2]
        a = a + 1
        print(a, villagecode, villagetype, villagename)
        with open(r'社区.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([villagecode, villagetype, villagename])
Step 7: run it
if __name__ == '__main__':
    a = 1
    headers = {'Connection': 'close',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    getlevelroot(url_level0)
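The full run takes a long time and may be interrupted, and the default arguments and commented-out URLs in getlevel2/getlevel3 suggest restarting from a mid-level page. As a hedged sketch, a single prefecture-level city can be re-crawled like this, reusing the functions above; the URL and codes are the ones from the comment inside getlevel2 (13 is Hebei, 1301 is Shijiazhuang).
if __name__ == '__main__':
    a = 1
    headers = {'Connection': 'close',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    # Resume sketch: instead of getlevelroot(url_level0), re-crawl only city 1301
    # under province 13 and everything beneath it.
    getlevel2(url='http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/13/1301.html',
              provinceurlcode='13')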
Here is a sample of the crawler's console output:
processing root page http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/index.html--------------------
current province =  北京市 http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11.html
processing province page http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11.html--------------------
current prefecture-level city =  http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11/1101.html 110100000000 市辖区
processing city page http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11/1101.html--------------------
current county/district =  http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11/01/110101.html 110101000000 东城区
processing county page http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11/01/110101.html--------------------
current town/street =  01/110101001.html http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11/01/01/110101001.html 110101001000 东华门街道
processing town page http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/11/01/01/110101001.html--------------------
2 110101001001 111 多福巷社区居委会
3 110101001002 111 银闸社区居委会
4 110101001005 111 东厂社区居委会
5 110101001006 111 智德社区居委会
Finally, you are welcome to follow the public account: python与大数据分析