1) Import the required libraries

import requests
import pandas as pd
from pprint import pprint
from lxml import etree
import time
import warnings
warnings.filterwarnings('ignore')
2) Crawl the listing pages and the detail pages

for i in range(1, 1501):
    print('Scraping page ' + str(i))
    url_pre = 'https://search./list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE,2,'
    url_end = '.html?'
    url = url_pre + str(i) + url_end
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    web = requests.get(url, headers=headers)
    web.encoding = 'gbk'
    dom = etree.HTML(web.text)
    # 1. Job title
    job_name = dom.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@title')
    # 2. Company name
    company_name = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t2"]/a[@target="_blank"]/@title')
    # 3. Work location
    address = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t3"]/text()')
    # 4. Salary
    salary_mid = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t4"]')
    salary = [s.text for s in salary_mid]
    # 5. Posting date
    release_time = dom.xpath('//div[@class="dw_table"]/div[@class="el"]/span[@class="t5"]/text()')
    # 6. URLs of the second-level (detail) pages
    deep_url = dom.xpath('//div[@class="dw_table"]/div[@class="el"]//p/span/a[@target="_blank"]/@href')
    RandomAll = []
    JobDescribe = []
    CompanyType = []
    CompanySize = []
    Industry = []
    for j in range(len(deep_url)):
        web_test = requests.get(deep_url[j], headers=headers)
        web_test.encoding = 'gbk'
        dom_test = etree.HTML(web_test.text)
        # 7. Experience and education, kept together in one raw field
        #    (random_all) and cleaned later
        random_all = dom_test.xpath('//div[@class="tHeader tHjob"]//div[@class="cn"]/p[@class="msg ltype"]/text()')
        # 8. Job description
        job_describe = dom_test.xpath('//div[@class="tBorderTop_box"]//div[@class="bmsg job_msg inbox"]/p/text()')
        # 9. Company type
        company_type = dom_test.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[1]/@title')
        # 10. Company size (headcount)
        company_size = dom_test.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[2]/@title')
        # 11. Industry of the company
        industry = dom_test.xpath('//div[@class="tCompany_sidebar"]//div[@class="com_tag"]/p[3]/@title')
        # Save the fields of this posting into their respective lists
        RandomAll.append(random_all)
        JobDescribe.append(job_describe)
        CompanyType.append(company_type)
        CompanySize.append(company_size)
        Industry.append(industry)
        # Sleep between requests to avoid triggering anti-scraping measures
        time.sleep(1)
    # Since many pages are crawled, save after every page instead of writing
    # everything out at the end in one error-prone step.
    df = pd.DataFrame()
    df['崗位名稱'] = job_name
    df['公司名稱'] = company_name
    df['工作地點'] = address
    df['工資'] = salary
    df['發(fā)布日期'] = release_time
    df['經(jīng)驗、學歷'] = RandomAll
    df['公司類型'] = CompanyType
    df['公司規(guī)模'] = CompanySize
    df['所屬行業(yè)'] = Industry
    df['崗位描述'] = JobDescribe
    # A write can occasionally fail, so wrap it in a try/except.
    try:
        df.to_csv('job_info.csv', mode='a+', header=None, index=None, encoding='gbk')
    except:
        print('Failed to write data for this page')
    time.sleep(1)
print('Scraping finished!')
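The cleaning steps that follow refer to columns named 公司名, 崗位名, 經(jīng)驗與學歷, and 工作描述, which differ from the names used when writing the CSV, so the headerless file was presumably read back with new names assigned. The original post does not show this step; a minimal sketch, assuming that naming:

# Sketch of reloading the headerless CSV; the column names and order here
# are an assumption inferred from the cleaning code below.
df = pd.read_csv('job_info.csv', engine='python', header=None, encoding='gbk')
df.columns = ['崗位名', '公司名', '工作地點', '工資', '發(fā)布日期',
              '經(jīng)驗與學歷', '公司類型', '公司規(guī)模', '所屬行業(yè)', '工作描述']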
3) Data cleaning

# Imports needed by the cleaning steps below
import numpy as np
import re

# Number of records before deduplication
print('Records before deduplication:', df.shape)
# Drop duplicate records (same company and job title)
df.drop_duplicates(subset=['公司名', '崗位名'], inplace=True)
# Number of records after deduplication
print('Records after deduplication:', df.shape)
# Keep only rows whose job title contains at least one target keyword
target_job = ['算法', '開發(fā)', '分析', '工程師', '數(shù)據(jù)', '運營', '運維']
index = [df['崗位名'].str.count(i) for i in target_job]
index = np.array(index).sum(axis=0) > 0
job_info = df[index]
job_info.shape
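To see what the mask does, here is a toy illustration; the three titles are made up, not from the scraped data:

# Hypothetical titles: count each keyword per title, sum the counts,
# and keep titles where at least one keyword occurred
s = pd.Series(['數(shù)據(jù)分析師', '前臺', '算法工程師'])
mask = np.array([s.str.count(k) for k in ['算法', '數(shù)據(jù)']]).sum(axis=0) > 0
print(mask)  # [ True False  True]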
# Inspect the last character (月/年) and the third-from-last character (萬/千)
# of the salary strings to see which formats occur
job_info['工資'].str[-1].value_counts()
job_info['工資'].str[-3].value_counts()
# Keep only salaries in the standard "x-y萬/月"-style formats
index1 = job_info['工資'].str[-1].isin(['年', '月'])
index2 = job_info['工資'].str[-3].isin(['萬', '千'])
job_info = job_info[index1 & index2]
def get_money_max_min(x):
    # Parse a salary string into [min, max] in yuan per month
    try:
        if x[-3] == '萬':
            z = [float(i) * 10000 for i in re.findall(r'[0-9]+\.?[0-9]*', x)]
        elif x[-3] == '千':
            z = [float(i) * 1000 for i in re.findall(r'[0-9]+\.?[0-9]*', x)]
        # Convert yearly figures to monthly
        if x[-1] == '年':
            z = [i / 12 for i in z]
        return z
    except:
        return x
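A quick sanity check with two made-up salary strings (the formats match the ones filtered for above; the values themselves are assumptions):

print(get_money_max_min('1-1.5萬/月'))  # [10000.0, 15000.0]
print(get_money_max_min('10-20萬/年'))  # [8333.33..., 16666.66...]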
salary = job_info['工資'].apply(get_money_max_min)
job_info['最低工資'] = salary.str[0]
job_info['最高工資'] = salary.str[1]
job_info['工資水平'] = job_info[['最低工資', '最高工資']].mean(axis=1)
# Treat suspiciously short values as missing, then strip the "['...']"
# wrapper left over from the stringified list
job_info.loc[job_info['公司類型'].apply(lambda x: len(x) < 6), '公司類型'] = np.nan
job_info['公司類型'] = job_info['公司類型'].str[2:-2]
# Extract the education level from the combined experience/education field
job_info['學歷'] = job_info['經(jīng)驗與學歷'].apply(lambda x: re.findall('本科|大專|應屆生|在校生|碩士', x))

def func(x):
    if len(x) == 0:
        return np.nan
    elif len(x) == 1 or len(x) == 2:
        return x[0]
    else:
        return x[2]

job_info['學歷'] = job_info['學歷'].apply(func)
# job_info['公司規(guī)模'].value_counts()
def func(x):
    if x == "['少于50人']":
        return '<50'
    elif x == "['50-150人']":
        return '50-150'
    elif x == "['150-500人']":
        return '150-500'
    elif x == "['500-1000人']":
        return '500-1000'
    elif x == "['1000-5000人']":
        return '1000-5000'
    elif x == "['5000-10000人']":
        return '5000-10000'
    elif x == "['10000人以上']":
        return '>10000'
    else:
        return np.nan

job_info['公司規(guī)模'] = job_info['公司規(guī)模'].apply(func)
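The same mapping can be written more compactly as a dictionary passed to Series.map, which returns NaN for anything unmapped; a minimal equivalent sketch:

# Dict-based version of the mapping above; unmatched values become NaN
size_map = {
    "['少于50人']": '<50',            "['50-150人']": '50-150',
    "['150-500人']": '150-500',       "['500-1000人']": '500-1000',
    "['1000-5000人']": '1000-5000',   "['5000-10000人']": '5000-10000',
    "['10000人以上']": '>10000',
}
job_info['公司規(guī)模'] = job_info['公司規(guī)模'].map(size_map)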
4) Word-frequency analysis for word clouds

The word-frequency analysis starts from the cleaned data set:

import numpy as np
import pandas as pd
import re
import jieba
import warnings
warnings.filterwarnings('ignore')
df = pd.read_excel(r'G:\8泰迪\python_project\51_job\new_job_info1.xlsx', encoding='gbk')
df
def get_word_cloud(data=None, job_name=None):
    # Descriptions are stored as stringified word lists; strip the brackets,
    # split on commas, and return the word frequencies for one job title
    words = []
    describe = data['工作描述'][data['崗位名'] == job_name].str[1:-1]
    describe.dropna(inplace=True)
    for i in describe:
        words.extend(i.split(','))
    words = pd.Series(words)
    word_fre = words.value_counts()
    return word_fre
zz = ['數(shù)據(jù)分析', '算法', '大數(shù)據(jù)', '開發(fā)工程師', '運營', '軟件工程', '運維', '數(shù)據(jù)庫', 'java', '測試']
for i in zz:
    word_fre = get_word_cloud(data=df, job_name=i)
    # Skip the top token and keep the next 100 most frequent words
    word_fre = word_fre[1:].reset_index()[:100]
    word_fre['崗位名'] = pd.Series(i, index=range(len(word_fre)))
    word_fre.to_csv(r'G:\8泰迪\python_project\51_job\詞云圖\bb.csv',
                    mode='a', index=False, header=None, encoding='gbk')

(The original post displays the resulting word-cloud image for each job title here.)
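The exported frequencies were apparently used to draw the word clouds shown above. A minimal sketch for rendering one directly in Python, assuming the third-party wordcloud package and a local Chinese font file (simhei.ttf is an assumption):

# Sketch: render one word cloud from the frequency Series; font path assumed
from wordcloud import WordCloud
import matplotlib.pyplot as plt

word_fre = get_word_cloud(data=df, job_name='數(shù)據(jù)分析')
wc = WordCloud(font_path='simhei.ttf', background_color='white',
               width=800, height=600)
wc.generate_from_frequencies(word_fre[1:101].to_dict())
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()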