业务部门需要更新最新的全国区划信息数据,建立基础数据库,权威数据当然是国家统计局的官方数据
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/
图片
这里要做的,就是将其爬取下来。环境准备:我们使用 Python 爬取数据,并将其保存为 Excel:
python环境 ,略过;
相关依赖requests、BeautifulSoup、pandas、threading、os;requests 用于web请求,并获取页面数据;BeautifulSoup 提取页面数据;pandas 数据分析,此处仅仅用来方便数据导出;threading 多线程爬取;
代码片段1、定义地址信息对象封装解析后的数据,areainfo
# Data holder for one administrative-division record parsed from a table row.
class areainfo():
    def __init__(self):
        self.areacode = ''    # administrative division code
        self.areaname = ''    # administrative division name
        self.parentcode = ''  # code of the parent division
        self.leve = ''        # division level, '1' (province) .. '5' (village)
        self.href = ''        # relative link to the child page ('' for leaf rows)

    def as_dict(self):
        """Return the record as a plain dict — one row for pandas export."""
        return {'areacode': self.areacode,
                'areaname': self.areaname,
                'parentcode': self.parentcode,
                'leve': self.leve,
                'href': self.href}

# 2、地址解析对象
将整个地址解析方法封装为一个类,包含 web请求、web解析等方法
图片
2.1 获取web信息
# classname: CSS class of the target <tr> rows; parnetcode: parent division
# code; leve: level of the rows being parsed.
def initAreainfo(self, url, classname, parnetcode, leve):
    """Parse one listing page into a list of areainfo records.

    Returns the list, or None when the page could not be fetched
    (getUrl already logged the URL to err.log in that case).
    """
    print('页面便签 %s -- 地址等级 %s --- url %s \n' % (classname, leve, url))
    soup = self.getUrl(url)
    if soup is None:
        return None
    # Rows of this level are selected by their CSS class (citytr/countytr/...).
    classes = soup.find_all(name='tr', attrs={'class': classname})
    entities = []  # renamed from `list`: do not shadow the builtin
    for classesoup in classes:
        group = classesoup.find_all('a')
        entity = areainfo()
        entity.leve = leve
        entity.parentcode = parnetcode
        # Fix: require BOTH anchors (code + name); the original `> 0` test
        # raised IndexError on rows that carry a single <a>.
        if len(group) > 1:
            entity.href = group[0]['href']
            entity.areacode = group[0].string
            entity.areaname = group[1].string
        else:
            # Leaf rows have no links, only plain <td> cells.
            tds = classesoup.find_all('td')
            entity.href = ''
            if len(tds) == 2:
                entity.areacode = tds[0].string
                entity.areaname = tds[1].string
            if len(tds) == 3:
                # villagetr rows: code, urban/rural classification, name
                entity.areacode = tds[0].string
                entity.areaname = tds[2].string
        entity.parentcode = parnetcode
        entities.append(entity)
    return entities
该处将异常的请求记录到 err.log 文件中,以便后期读取异常链接,补充丢失的数据。
# 2.2 web page parsing
# classname: CSS class of the target <tr> rows; parnetcode: parent division
# code; leve: current division level.
def initAreainfo(self, url, classname, parnetcode, leve):
    """Parse one listing page into a list of areainfo, or None on fetch failure."""
    print('页面便签 %s -- 地址等级 %s --- url %s \n' % (classname, leve, url))
    soup = self.getUrl(url)
    if soup is None:
        return None
    rows = soup.find_all(name='tr', attrs={'class': classname})
    parsed = []
    for row in rows:
        anchors = row.find_all('a')
        record = areainfo()
        record.leve = leve
        record.parentcode = parnetcode
        if len(anchors) > 0:
            # Linked rows: first anchor is the code, second the name.
            record.href = anchors[0]['href']
            record.areacode = anchors[0].string
            record.areaname = anchors[1].string
        else:
            # Leaf rows carry only plain cells and no child link.
            cells = row.find_all('td')
            record.href = ''
            if len(cells) == 2:
                record.areacode = cells[0].string
                record.areaname = cells[1].string
            if len(cells) == 3:
                record.areacode = cells[0].string
                record.areaname = cells[2].string
        record.parentcode = parnetcode
        parsed.append(record)
    return parsed
# In the page each division level uses a different row tag class; open the
# browser dev tools (F12) to identify it.  BeautifulSoup extracts the needed
# division rows by that tag class.
eg
图片
2.3 区划信息提取:各等级区划信息的提取分别调用 2.2 的解析方法完成,每个方法返回一个地址 list。
def getPronvice(self):
    """Level 1: scrape the province list from the index page."""
    soup = self.getUrl(self.base)
    if soup is None:
        return None
    provincesoups = soup.find_all(name='tr', attrs={'class': 'provincetr'})
    provinceList = []
    for rowsoup in provincesoups:
        for anchor in rowsoup.find_all('a'):
            province = areainfo()
            province.href = anchor['href']
            province.areaname = anchor.get_text()
            # e.g. '11.html' -> '110000'
            province.areacode = anchor['href'].replace('.html', '0000')
            province.parentcode = '0'
            province.leve = '1'
            print(province.__dict__)
            provinceList.append(province)
    return provinceList

def getCity(self, parent):
    """Level 2: cities under a province."""
    return self.initAreainfo(self.base + parent.href, 'citytr',
                             parent.areacode, '2')

def getCounty(self, parent):
    """Level 3: counties under a city."""
    return self.initAreainfo(self.base + parent.href, 'countytr',
                             parent.areacode, '3')

def getTown(self, parent):
    """Level 4: towns under a county; leaf counties have no child link."""
    if parent.href == '':
        return None
    # Town pages live under a per-province subdirectory.
    url = self.base + parent.areacode[0:2] + '/' + parent.href
    return self.initAreainfo(url, 'towntr', parent.areacode, '4')

def getVillagetr(self, parent):
    """Level 5: villages under a town; leaf towns have no child link."""
    if parent.href == '':
        return None
    # Village pages live under province/city subdirectories.
    url = (self.base + parent.areacode[0:2] + '/' +
           parent.areacode[2:4] + '/' + parent.href)
    return self.initAreainfo(url, 'villagetr', parent.areacode, '5')
2.4 省份数据封装
获取一个省下边所有地址数据
def finAllPronvinceCity(self, pro, dir):
    """Collect every division under one province and export it to
    <dir>/<province name>.xlsx.

    pro -- the province areainfo record; dir -- output directory (created
    if missing).
    """
    listall = [pro]
    citylist = self.getCity(pro)
    # Fix: getCity returns None when the fetch fails; iterating None raised
    # a TypeError and killed the whole province thread.
    if citylist is None:
        citylist = []
    for city in citylist:
        listall.append(city)
        conlist = self.getCounty(city)
        if conlist is not None:
            for county in conlist:
                listall.append(county)
                townlist = self.getTown(county)
                if townlist is not None:
                    for town in townlist:
                        listall.append(town)
                        villagelist = self.getVillagetr(town)
                        if villagelist is not None:
                            listall.extend(villagelist)
    df = pd.DataFrame([x.as_dict() for x in listall])
    if not os.path.exists(dir):
        os.makedirs(dir)
    filepath = os.path.join(dir, pro.areaname + '.xlsx')
    # Context manager saves and closes the workbook even on error
    # (ExcelWriter.save() was removed in pandas 2.x).
    with pd.ExcelWriter(filepath) as writer:
        df.to_excel(writer, float_format='%.5f')

# 2.5 thread wrapper
def ruanthread(self):
    """Spawn one worker thread per province; each writes its own Excel file."""
    provinces = self.getPronvice()
    if provinces is None:  # index page unreachable -> nothing to do
        return
    for province in provinces:
        threading.Thread(target=self.finAllPronvinceCity,
                         args=(province, 'F://areainfo')).start()

# 2.6 main entry point
if __name__ == '__main__':
    crawler = china_city()  # renamed: the original rebinding shadowed the class
    crawler.ruanthread()
2.7 补充-err.log 数据处理
构建新的方法,仅解析区划信息。该方法不太完善,仅供参考。
def getCityOnly(self, url, str, leve):
    # NOTE(review): parameter `str` shadows the builtin; name kept to
    # preserve the published signature.
    """Re-parse a single failed URL; the parent code is unknown, so it is
    left empty in the resulting records."""
    return self.initAreainfo(url, str, '', leve)

# Re-drive the failed URLs recorded in err.log and return the recovered data.
def errFileRe(self):
    """Read err.log line by line and re-crawl every non-blank URL.

    NOTE(review): assumes every failed URL was a 'villagetr' page --
    TODO confirm before reusing for other levels.
    """
    listother = []
    with open('err.log', 'r') as file:
        for line in file:
            # Skip blank lines between logged URLs.
            if line.isspace():
                continue
            url = line.replace('\n', '')
            records = self.getCityOnly(url, 'villagetr', '5')
            # Fix: a repeated failure returns None; extending with None
            # raised TypeError and aborted the whole recovery pass.
            if records is not None:
                listother.extend(records)
    return listother
# Run it:
图片
导出数据列表
图片
数据格式图片
err.log日志:图片
完整代码附上完整代码
import requests
from bs4 import BeautifulSoup
import pandas as pd
import threading
import os


class areainfo():
    """One administrative-division record parsed from a table row."""

    def __init__(self):
        self.areacode = ''    # administrative division code
        self.areaname = ''    # administrative division name
        self.parentcode = ''  # code of the parent division
        self.leve = ''        # level, '1' (province) .. '5' (village)
        self.href = ''        # relative link to the child page ('' for leaves)

    def as_dict(self):
        """Return the record as a plain dict — one row for pandas export."""
        return {'areacode': self.areacode, 'areaname': self.areaname,
                'parentcode': self.parentcode, 'leve': self.leve,
                'href': self.href}


class china_city():
    """Crawler for the NBS 2020 administrative-division-code pages."""

    def __init__(self):
        # Root of the 2020 edition of the division-code site.
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

    def getUrl(self, url):
        """Fetch `url` and return a BeautifulSoup, or None on failure.

        Failed URLs are appended to err.log so the lost data can be
        re-crawled later (see errFileRe in section 2.7)."""
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            resp = requests.get(url, headers=headers)
            resp.encoding = 'gbk'  # the site serves GBK-encoded pages
            return BeautifulSoup(resp.text, 'html.parser')
        except Exception as e:
            print(e)
            # 'a' appends so earlier failures are kept across runs.
            with open('err.log', 'a') as file:
                file.write(url + '\n')
            return None

    def getPronvice(self):
        """Level 1: scrape the province list from the index page."""
        soup = self.getUrl(self.base)
        if soup is None:
            return None
        provincesoups = soup.find_all(name='tr', attrs={'class': 'provincetr'})
        provinceList = []
        for provincesoup in provincesoups:
            for k in provincesoup.find_all('a'):
                province = areainfo()
                province.href = k['href']
                province.areaname = k.get_text()
                # e.g. '11.html' -> '110000'
                province.areacode = k['href'].replace('.html', '0000')
                province.parentcode = '0'
                province.leve = '1'
                print(province.__dict__)
                provinceList.append(province)
        return provinceList

    def getCity(self, parent):
        """Level 2: cities under a province."""
        return self.initAreainfo(self.base + parent.href, 'citytr',
                                 parent.areacode, '2')

    def getCounty(self, parent):
        """Level 3: counties under a city."""
        return self.initAreainfo(self.base + parent.href, 'countytr',
                                 parent.areacode, '3')

    def getTown(self, parent):
        """Level 4: towns under a county; leaf counties have no child link."""
        if parent.href == '':
            return None
        # Town pages live under a per-province subdirectory.
        url = self.base + parent.areacode[0:2] + '/' + parent.href
        return self.initAreainfo(url, 'towntr', parent.areacode, '4')

    def getVillagetr(self, parent):
        """Level 5: villages under a town; leaf towns have no child link."""
        if parent.href == '':
            return None
        # Village pages live under province/city subdirectories.
        url = (self.base + parent.areacode[0:2] + '/' +
               parent.areacode[2:4] + '/' + parent.href)
        return self.initAreainfo(url, 'villagetr', parent.areacode, '5')

    def initAreainfo(self, url, classname, parnetcode, leve):
        """Parse one listing page into a list of areainfo records.

        classname -- CSS class of the target <tr> rows; parnetcode -- parent
        division code; leve -- level of the rows being parsed.  Returns None
        when the page could not be fetched.
        """
        print('页面便签 %s -- 地址等级 %s --- url %s \n' % (classname, leve, url))
        soup = self.getUrl(url)
        if soup is None:
            return None
        classes = soup.find_all(name='tr', attrs={'class': classname})
        entities = []  # renamed from `list`: do not shadow the builtin
        for classesoup in classes:
            group = classesoup.find_all('a')
            entity = areainfo()
            entity.leve = leve
            entity.parentcode = parnetcode
            # Fix: require BOTH anchors (code + name); `> 0` could IndexError
            # on rows carrying a single <a>.
            if len(group) > 1:
                entity.href = group[0]['href']
                entity.areacode = group[0].string
                entity.areaname = group[1].string
            else:
                # Leaf rows have no links, only plain <td> cells.
                tds = classesoup.find_all('td')
                entity.href = ''
                if len(tds) == 2:
                    entity.areacode = tds[0].string
                    entity.areaname = tds[1].string
                if len(tds) == 3:
                    # villagetr rows: code, urban/rural classification, name
                    entity.areacode = tds[0].string
                    entity.areaname = tds[2].string
            entity.parentcode = parnetcode
            entities.append(entity)
        return entities

    def finAllPronvinceCity(self, pro, dir):
        """Collect every division under province `pro` and export it to
        <dir>/<province name>.xlsx (directory created if missing)."""
        listall = [pro]
        citylist = self.getCity(pro)
        # Fix: getCity returns None on fetch failure; iterating None raised
        # TypeError and killed the whole province thread.
        if citylist is None:
            citylist = []
        for city in citylist:
            listall.append(city)
            conlist = self.getCounty(city)
            if conlist is not None:
                for county in conlist:
                    listall.append(county)
                    townlist = self.getTown(county)
                    if townlist is not None:
                        for town in townlist:
                            listall.append(town)
                            villagelist = self.getVillagetr(town)
                            if villagelist is not None:
                                listall.extend(villagelist)
        df = pd.DataFrame([x.as_dict() for x in listall])
        if not os.path.exists(dir):
            os.makedirs(dir)
        filepath = os.path.join(dir, pro.areaname + '.xlsx')
        # Context manager saves and closes the workbook even on error
        # (ExcelWriter.save() was removed in pandas 2.x).
        with pd.ExcelWriter(filepath) as writer:
            df.to_excel(writer, float_format='%.5f')

    def ruanthread(self):
        """Spawn one worker thread per province; each writes its own file."""
        provinces = self.getPronvice()
        if provinces is None:  # index page unreachable -> nothing to do
            return
        for province in provinces:
            threading.Thread(target=self.finAllPronvinceCity,
                             args=(province, 'F://areainfo')).start()


if __name__ == '__main__':
    crawler = china_city()  # renamed: the original rebinding shadowed the class
    crawler.ruanthread()
# (Hosting-site disclaimer: all content is user-posted; storage only —
#  report harmful or infringing content.)