富联娱乐
  • 首页
  • 关于富联娱乐
  • 业务范围
  • 最新动态
  • 联系我们
  • 关于富联娱乐你的位置:富联娱乐 > 关于富联娱乐 > Python 爬虫 中国行政区划信息爬取
    Python 爬虫 中国行政区划信息爬取
    发布日期:2024-08-23 20:04    点击次数:153

    前言

    业务部门需要更新最新的全国区划信息数据,建立基础数据库,权威数据当然是国家统计局的官方数据

    http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/

    图片

    这里要做的,就是将其爬取下来。

    环境准备

    我们使用python工具爬取数据,并将其保存为Excel:

    python环境 ,略过;

    相关依赖requests、BeautifulSoup、pandas、threading、os;requests 用于web请求,并获取页面数据;BeautifulSoup 提取页面数据;pandas 数据分析,此处仅仅用来方便数据导出;threading 多线程爬取;

    代码片段

    1、定义地址信息对象

    封装解析后的数据,areainfo

    class areainfo():
        """One administrative-division record parsed from a listing page."""

        def __init__(self):
            self.areacode = ''    # administrative division code
            self.areaname = ''    # administrative division name
            self.parentcode = ''  # code of the parent division
            self.leve = ''        # division level
            self.href = ''        # link to the next-level listing page

        def as_dict(self):
            """Return the record as a plain dict keyed by attribute name."""
            return {
                'areacode': self.areacode,
                'areaname': self.areaname,
                'parentcode': self.parentcode,
                'leve': self.leve,
                'href': self.href,
            }

    2、地址解析对象

    将整个地址解析方法封装为一个类,包含 web请求、web解析等方法

    图片

    2.1 获取web信息

          def getUrl(self, url, timeout=30):
              """Fetch *url* and return the page parsed with BeautifulSoup.

              Args:
                  url: Absolute URL of the page to download.
                  timeout: Seconds to wait for the server before giving up.

              Returns:
                  The parsed page, or None when the request failed. A failed
                  request is printed and its URL is appended to ``err.log`` so
                  that it can be retried later.
              """
              headers = {
                  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
              try:
                  resp = requests.get(url, headers=headers, timeout=timeout)
                  # Treat HTTP error statuses as failures so they reach err.log
                  # instead of being parsed as an empty listing.
                  resp.raise_for_status()
              except requests.RequestException as e:
                  print(e)
                  # Append mode: 'w' would overwrite the log on every failure.
                  with open('err.log', 'a') as file:
                      file.write(url + '\n')
                  return None
              resp.encoding = 'gbk'
              return BeautifulSoup(resp.text, 'html.parser')

    该处将异常的请求存到err.log文件中,以便于后期读取异常连接,补充丢失数据。

    2.2 web信息解析 #classname 页面便签 ,parnetcode 父级区划编码,leve 当前区划等级 def initAreainfo(self,url,classname,parnetcode,leve): print( '页面便签 %s -- 地址等级 %s --- url %s \n' % (classname,leve,url)) soup = self.getUrl(url) if soup is None: return None classes = soup.find_all(name='tr', attrs={'class': classname}) # 按照字典的形式给attrs参数赋值 list = [] for classesoup in classes: group = classesoup.find_all('a') entity = areainfo() entity.leve = leve entity.parentcode = parnetcode if len(group) > 0: entity.href = group[0]['href'] entity.areacode = group[0].string entity.areaname = group[1].string else: tds = classesoup.find_all('td') entity.href = '' if len(tds)==2 : entity.areacode = tds[0].string entity.areaname = tds[1].string if len(tds)==3: entity.areacode = tds[0].string entity.areaname = tds[2].string entity.parentcode = parnetcode list.append(entity)     return list

    网页中,每一层级区划信息的标签不同,可使用浏览器F12进入调试模式识别。BeautifulSoup 通过标签的 class 属性提取,获取需要的区划信息数据。

    eg

    图片

    2.3 区划信息提取

    各等级区划信息提取,分别调用2.2的方法进行解析。每个方法返回地址list

        def getPronvice(self):
            """Return the level-1 (province) records from the index page.

            Returns None when the index page could not be fetched.
            """
            soup = self.getUrl(self.base)
            if soup is None:
                return None
            provinceList = []
            for row in soup.find_all(name='tr', attrs={'class': 'provincetr'}):
                for link in row.find_all('a'):
                    province = areainfo()
                    province.href = link['href']
                    province.areaname = link.get_text()
                    # The code is the href with its '.html' suffix replaced
                    # by '0000'.
                    province.areacode = link['href'].replace('.html', '0000')
                    province.parentcode = '0'
                    province.leve = '1'
                    print(province.__dict__)
                    provinceList.append(province)
            return provinceList

        def getCity(self, parent):
            """Return the level-2 (city) records below *parent*, or None."""
            if not parent.href:
                # No child page: skip the request instead of re-fetching the index.
                return None
            url = self.base + parent.href
            return self.initAreainfo(url, 'citytr', parent.areacode, '2')

        def getCounty(self, parent):
            """Return the level-3 (county) records below *parent*, or None."""
            if not parent.href:
                return None
            url = self.base + parent.href
            return self.initAreainfo(url, 'countytr', parent.areacode, '3')

        def getTown(self, parent):
            """Return the level-4 (town) records below *parent*, or None."""
            if not parent.href:
                return None
            # The href is joined under a directory named after the first two
            # characters of the parent's code.
            url = self.base + parent.areacode[0:2] + '/' + parent.href
            return self.initAreainfo(url, 'towntr', parent.areacode, '4')

        def getVillagetr(self, parent):
            """Return the level-5 (village) records below *parent*, or None."""
            if not parent.href:
                return None
            # Two directory levels: characters 0-1 and 2-3 of the parent's code.
            url = (self.base + parent.areacode[0:2] + '/'
                   + parent.areacode[2:4] + '/' + parent.href)
            return self.initAreainfo(url, 'villagetr', parent.areacode, '5')

    2.4 省份数据封装

    获取一个省下边所有地址数据

    ''' 通过省份获取该省份下所有地址信息 ''' def finAllPronvinceCity(self,pro,dir): listall=[] listall.append(pro) citylist = self.getCity(pro) for city in citylist : listall.append(city) #print(city.__dict__) conlist = self.getCounty(city) if conlist is not None : for county in conlist: #print(county.__dict__) listall.append(county) townlist = self.getTown(county) if townlist is not None: for town in townlist: #print(town.__dict__) listall.append(town) villagelist = self.getVillagetr(town) if villagelist is not None: listall.extend(villagelist) df = pd.DataFrame([x.as_dict() for x in listall]) #print(df) isExists = os.path.exists(dir) if not isExists: os.makedirs(dir) filepath = os.path.join(dir,pro.areaname+'.xlsx'); writer = pd.ExcelWriter(filepath) df.to_excel(writer, float_format='%.5f')       writer.save()2.5 线程封装
       def ruanthread(self, out_dir='F://areainfo'):
           """Start one thread per province to crawl and export it.

           Args:
               out_dir: Directory handed to ``finAllPronvinceCity`` for the
                   exported workbooks.

           The threads are started and not joined. Nothing is started when
           the province list could not be fetched.
           """
           provinces = self.getPronvice()
           if not provinces:
               return
           for province in provinces:
               threading.Thread(target=self.finAllPronvinceCity,
                                args=(province, out_dir)).start()
    2.6 万能的MAIN if __name__ == '__main__': china_city=china_city()   china_city.ruanthread()

    2.7 补充-err.log 数据处理

    构建新的方法,仅仅解析区划信息。该方法不太完善,仅供参考

      def getCityOnly(self, url, str, leve):
          """Parse a single page without linking its records to a parent.

          Args:
              url: Page to parse.
              str: CSS class of the rows to extract (forwarded as the
                  ``classname`` argument of ``initAreainfo``).
              leve: Level label stored on every record.

          Returns:
              Whatever ``initAreainfo`` returns for the page; the parent code
              of every record is the empty string.
          """
          return self.initAreainfo(url, str, '', leve)
    输出数据

    def errFileRe(self):
        """Re-crawl the URLs recorded in ``err.log``.

        Every non-blank line is treated as the URL of a level-5 page and
        parsed with the 'villagetr' row class.

        Returns:
            The records from all pages that could be fetched this time.
        """
        listother = []
        with open('err.log', 'r') as file:
            for line in file:
                url = line.strip()
                if not url:
                    # Skip blank lines.
                    continue
                entities = self.getCityOnly(url, 'villagetr', '5')
                # A page that fails again yields None; skip it so the
                # remaining URLs are still processed.
                if entities is not None:
                    listother.extend(entities)
        return listother

    运行

    图片

    导出数据列表

    图片

    数据格式

    图片

    err.log日志:

    图片

    完整代码

    附上完整代码

    import os
    import threading

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup


    class areainfo():
        """One administrative-division record parsed from a listing page."""

        def __init__(self):
            self.areacode = ''    # administrative division code
            self.areaname = ''    # administrative division name
            self.parentcode = ''  # code of the parent division
            self.leve = ''        # division level
            self.href = ''        # link to the next-level listing page

        def as_dict(self):
            """Return the record as a plain dict keyed by attribute name."""
            return {
                'areacode': self.areacode,
                'areaname': self.areaname,
                'parentcode': self.parentcode,
                'leve': self.leve,
                'href': self.href,
            }


    class china_city():
        """Crawler for the division listings published under ``self.base``."""

        def __init__(self):
            self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

        def getUrl(self, url, timeout=30):
            """Fetch *url* and return the page parsed with BeautifulSoup.

            Args:
                url: Absolute URL of the page to download.
                timeout: Seconds to wait for the server before giving up.

            Returns:
                The parsed page, or None when the request failed. A failed
                request is printed and its URL is appended to ``err.log`` so
                that it can be retried later.
            """
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            try:
                resp = requests.get(url, headers=headers, timeout=timeout)
                # Treat HTTP error statuses as failures so they reach err.log
                # instead of being parsed as an empty listing.
                resp.raise_for_status()
            except requests.RequestException as e:
                print(e)
                # Append mode: 'w' would overwrite the log on every failure.
                with open('err.log', 'a') as file:
                    file.write(url + '\n')
                return None
            resp.encoding = 'gbk'
            return BeautifulSoup(resp.text, 'html.parser')

        def getPronvice(self):
            """Return the level-1 (province) records from the index page.

            Returns None when the index page could not be fetched.
            """
            soup = self.getUrl(self.base)
            if soup is None:
                return None
            provinceList = []
            for row in soup.find_all(name='tr', attrs={'class': 'provincetr'}):
                for link in row.find_all('a'):
                    province = areainfo()
                    province.href = link['href']
                    province.areaname = link.get_text()
                    # The code is the href with its '.html' suffix replaced
                    # by '0000'.
                    province.areacode = link['href'].replace('.html', '0000')
                    province.parentcode = '0'
                    province.leve = '1'
                    print(province.__dict__)
                    provinceList.append(province)
            return provinceList

        def getCity(self, parent):
            """Return the level-2 (city) records below *parent*, or None."""
            if not parent.href:
                # No child page: skip the request instead of re-fetching the index.
                return None
            url = self.base + parent.href
            return self.initAreainfo(url, 'citytr', parent.areacode, '2')

        def getCounty(self, parent):
            """Return the level-3 (county) records below *parent*, or None."""
            if not parent.href:
                return None
            url = self.base + parent.href
            return self.initAreainfo(url, 'countytr', parent.areacode, '3')

        def getTown(self, parent):
            """Return the level-4 (town) records below *parent*, or None."""
            if not parent.href:
                return None
            # The href is joined under a directory named after the first two
            # characters of the parent's code.
            url = self.base + parent.areacode[0:2] + '/' + parent.href
            return self.initAreainfo(url, 'towntr', parent.areacode, '4')

        def getVillagetr(self, parent):
            """Return the level-5 (village) records below *parent*, or None."""
            if not parent.href:
                return None
            # Two directory levels: characters 0-1 and 2-3 of the parent's code.
            url = (self.base + parent.areacode[0:2] + '/'
                   + parent.areacode[2:4] + '/' + parent.href)
            return self.initAreainfo(url, 'villagetr', parent.areacode, '5')

        def initAreainfo(self, url, classname, parnetcode, leve):
            """Parse one listing page into a list of ``areainfo`` records.

            Args:
                url: Page to download via ``self.getUrl``.
                classname: CSS class of the ``<tr>`` rows to extract.
                parnetcode: Code of the parent division, stored on every record.
                leve: Level label stored on every record.

            Returns:
                A list of ``areainfo`` records, or None when the page could
                not be fetched.
            """
            print('页面便签 %s -- 地址等级 %s --- url  %s  \n' % (classname, leve, url))
            soup = self.getUrl(url)
            if soup is None:
                return None
            entities = []
            for row in soup.find_all(name='tr', attrs={'class': classname}):
                entity = areainfo()
                entity.leve = leve
                entity.parentcode = parnetcode
                links = row.find_all('a')
                if links:
                    # Linked row: the first anchor carries the code and the
                    # last one the name. get_text() yields a plain str, so the
                    # record does not keep the whole parsed page alive.
                    entity.href = links[0]['href']
                    entity.areacode = links[0].get_text(strip=True)
                    entity.areaname = links[-1].get_text(strip=True)
                else:
                    cells = row.find_all('td')
                    # Unlinked rows have 2 or 3 cells: code first, name last.
                    if len(cells) not in (2, 3):
                        continue
                    entity.href = ''
                    entity.areacode = cells[0].get_text(strip=True)
                    entity.areaname = cells[-1].get_text(strip=True)
                entities.append(entity)
            return entities

        def finAllPronvinceCity(self, pro, dir):
            """Collect every record below province *pro* and export it to Excel.

            Args:
                pro: The province record to start from.
                dir: Output directory; created if missing. The workbook is
                    written there as ``<province name>.xlsx``.
            """
            listall = [pro]
            # Each getter returns None when its page is missing or failed to
            # download; `or []` lets the walk continue with the other branches.
            for city in self.getCity(pro) or []:
                listall.append(city)
                for county in self.getCounty(city) or []:
                    listall.append(county)
                    for town in self.getTown(county) or []:
                        listall.append(town)
                        listall.extend(self.getVillagetr(town) or [])
            df = pd.DataFrame([x.as_dict() for x in listall])
            # exist_ok avoids a FileExistsError when several province threads
            # create the directory at the same time.
            os.makedirs(dir, exist_ok=True)
            filepath = os.path.join(dir, pro.areaname + '.xlsx')
            # The context manager saves and closes the workbook;
            # ExcelWriter.save() no longer exists in pandas 2.x.
            with pd.ExcelWriter(filepath) as writer:
                df.to_excel(writer, float_format='%.5f')

        def ruanthread(self, out_dir='F://areainfo'):
            """Start one thread per province to crawl and export it.

            Args:
                out_dir: Directory handed to ``finAllPronvinceCity`` for the
                    exported workbooks.

            The threads are started and not joined. Nothing is started when
            the province list could not be fetched.
            """
            provinces = self.getPronvice()
            if not provinces:
                return
            for province in provinces:
                threading.Thread(target=self.finAllPronvinceCity,
                                 args=(province, out_dir)).start()


    if __name__ == '__main__':
        # Use a separate name for the instance so the class `china_city`
        # is not overwritten.
        crawler = china_city()
        crawler.ruanthread()

    本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报。

    Powered by 富联娱乐 @2013-2022 RSS地图 HTML地图