任務目標: 1. 抓取不同類型的圖片 2. 編寫一個GUI界面爬蟲程序,打包成exe可執行文件 3. 遇到的難點 1. 分析如何抓取不同類型的圖片 首先打開網站,可以看到有如下6個類型的菜單 點擊不同菜單,發現url顯示如下 大胸妹:https://www./?cid=2 小翹臀:https://www./?cid=6 可以看到每個類型圖片對應不同的cid值 所以要想抓取不同類型的圖片,只需要構造下url 將cid進行參數化,然后傳給url即可 具體代碼在下面給出 2. 利用tkinter進行GUI編程 之前有寫過一些tkinter編程的隨筆 例如 利用python制作一個翻譯工具 先來看一下這次設計的程序最終頁面布局, 然后再具體講下如何實現的,頁面布局如下: 1. 選擇圖片存儲路徑 抓取到的圖片要保存到電腦本地,所以就想著最好能夠自己選取本地任意一個文件夾作為存儲路徑 后來網上沖浪一番發現tkinter是可以實現這個功能的 可以通過tkinter.filedialog模塊中的askdirectory()方法實現 下面是在網上找到的一段示例代碼
#coding:UTF-8
from tkinter import *
from tkinter.filedialog import askdirectory
def select_path():
    """Ask the user for a folder and show the choice in the entry box."""
    path_ = askdirectory()
    # askdirectory() returns an empty string when the dialog is cancelled;
    # keep whatever path is already shown in that case.
    if path_:
        path.set(path_)


# Minimal demo window: a label, an entry bound to ``path`` and a button that
# opens the folder picker.
root = Tk()
path = StringVar()
Label(root, text='目標(biāo)路徑:').grid(row=0, column=0)
Entry(root, textvariable=path).grid(row=0, column=1)
Button(root, text='路徑選擇', command=select_path).grid(row=0, column=2)
root.mainloop()
效果如下 具體到這個例子, (1)定義一個文本框,用來存放(顯示)選擇的存儲路徑 self.input = tk.Entry(self.window, textvariable = self.path, width=80)
(2)定義一個按鈕,來觸發選擇本地路徑功能 self.t_button = tk.Button(self.window, text='選擇路徑', relief=tk.RAISED, width=8, height=1, command=self.select_Path)
(3)定義一個函數,來實現選取路徑功能
def select_Path(self):
    """Let the user pick a local folder and store it in ``self.path``."""
    path_ = askdirectory()
    # askdirectory() returns an empty string when the dialog is cancelled;
    # keep the previously selected folder in that case.
    if path_:
        self.path.set(path_)
后續保存圖片時,路徑可以直接使用前面定義好的self.input中的值 2. 選擇分類 因為圖片分為了6個類別,每個類別對應一個cid值,所以可以事先把cid抽象出來,當作參數傳遞
(1)定義一個下拉框,存儲圖片類型 self.menu['value'] = ('大胸妹','小翹臀', '黑絲襪', '美腿控', '有顏值','大雜燴')
# (2) Return a different cid depending on the selected category.
def get_cid(self):
    """Return the cid query value for the category chosen in the drop-down.

    Reads the current text of ``self.menu`` and looks it up in the table
    below. Returns None when the text is not one of the six known labels.
    """
    category = {
        '大胸妹': 2,
        '小翹臀': 6,
        '黑絲襪': 7,
        '美腿控': 3,
        '有顏值': 4,
        '大雜燴': 5,
    }
    return category.get(self.menu.get())
3. 填寫爬取頁數
self.page = tk.Entry(self.window, width=5) # Entry box that holds the number of pages to crawl
后面把這個文本框中的值傳給url即可 整體效果如下
最后附上完整代碼:
import requests
from requests.exceptions import RequestException
import tkinter as tk
from tkinter import ttk
from bs4 import BeautifulSoup
import bs4
from tkinter import *
from tkinter.filedialog import askdirectory
import os
class DB():
    """Tkinter front end for a small image crawler.

    The window lets the user pick a destination folder, one of six image
    categories and a page count. Pressing the crawl button fetches that many
    listing pages and stores every image found under
    ``<folder>/pics/<category>/``.
    """

    # Listing URL prefix; ``cid=<n>&page=<n>`` is appended for each request.
    # NOTE(review): the host part looks truncated ('www.' with no domain)
    # while HEADERS names www.dbmeinv.com -- confirm the intended URL.
    BASE_URL = 'https://www./?'

    # (drop-down label, cid query value) pairs, in drop-down order.
    CATEGORIES = (
        ('大胸妹', 2),
        ('小翹臀', 6),
        ('黑絲襪', 7),
        ('美腿控', 3),
        ('有顏值', 4),
        ('大雜燴', 5),
    )
    CATEGORY_CIDS = dict(CATEGORIES)

    # Browser-like headers sent with every listing-page request.
    HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q = 0.9, image/webp,image/apng,*/*;q='
                  '0.8',
        # 'br' is deliberately not advertised: the HTTP stack can only decode
        # brotli when an optional brotli package is installed.
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.dbmeinv.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64) AppleWebKit/537.36(KHTML, likeGecko) Chrome/'
                      '70.0.3538.102Safari/537.36 '
    }

    # Seconds to wait for the server before a request is abandoned, so a
    # stalled connection cannot block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Create the window and all of its widgets (layout is done later)."""
        self.window = tk.Tk()  # top-level window
        self.window.title('Crawler Pics')
        # self.window.resizable(0,0)  # uncomment to forbid resizing
        self.menu = ttk.Combobox(self.window, width=6)
        self.path = tk.StringVar()  # backs the destination-folder entry
        self.lab1 = tk.Label(self.window, text='目標(biāo)路徑:')
        self.lab2 = tk.Label(self.window, text='選擇分類:')
        self.lab3 = tk.Label(self.window, text='爬取頁(yè)數(shù):')
        # Entry holding the number of pages to crawl.
        self.page = tk.Entry(self.window, width=5)
        # Entry showing the folder the images are saved under.
        self.input = tk.Entry(self.window, textvariable=self.path, width=80)
        # Text box used as the on-screen log.
        self.info = tk.Text(self.window, height=20)
        self.menu['value'] = tuple(name for name, _ in self.CATEGORIES)
        self.menu.current(0)
        # Button that opens the folder picker.
        self.t_button = tk.Button(self.window, text='選擇路徑', relief=tk.RAISED,
                                  width=8, height=1, command=self.select_Path)
        # Button that starts the crawl.
        self.t_button1 = tk.Button(self.window, text='爬取', relief=tk.RAISED,
                                   width=8, height=1, command=self.download)
        # Button that clears the log box.
        self.c_button2 = tk.Button(self.window, text='清空輸出', relief=tk.RAISED,
                                   width=8, height=1, command=self.cle)

    def gui_arrang(self):
        """Place every widget on the window grid."""
        self.lab1.grid(row=0, column=0)
        self.lab2.grid(row=1, column=0)
        self.menu.grid(row=1, column=1, sticky=tk.W)
        self.lab3.grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
        self.page.grid(row=2, column=1, sticky=tk.W)
        self.input.grid(row=0, column=1)
        self.info.grid(row=3, rowspan=5, column=0, columnspan=3, padx=15, pady=15)
        self.t_button.grid(row=0, column=2, padx=5, pady=5, sticky=tk.W)
        self.t_button1.grid(row=1, column=2)
        self.c_button2.grid(row=0, column=3, padx=5, pady=5, sticky=tk.W)

    def get_cid(self):
        """Return the cid for the category selected in the drop-down.

        Returns None when the drop-down text is not a known category label.
        """
        return self.CATEGORY_CIDS.get(self.menu.get())

    def select_Path(self):
        """Let the user pick a local folder and store it in ``self.path``."""
        path_ = askdirectory()
        # askdirectory() returns an empty string when the dialog is
        # cancelled; keep the previously selected folder in that case.
        if path_:
            self.path.set(path_)

    def get_html(self, url, header=None):
        """Fetch ``url`` and return the response text.

        Returns None when the request raises a RequestException or the
        status code is not 200.
        """
        try:
            # The request itself is what can raise, so it must be inside the try.
            response = requests.get(url, headers=header,
                                    timeout=self.REQUEST_TIMEOUT)
        except RequestException:
            print('請(qǐng)求失敗')
            return None
        if response.status_code == 200:
            return response.text
        return None

    def parse_html(self, html, list_data):
        """Collect ``alt`` -> ``src`` for the ``<img>`` tags found in ``html``.

        Each usable ``[alt, src]`` pair is appended to ``list_data`` and the
        dict built from ``list_data`` is returned, so a later pair with the
        same ``alt`` replaces an earlier one. Images lacking either attribute
        are skipped because they cannot be named or downloaded. An empty or
        missing ``html`` adds nothing.
        """
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup.find_all('img'):
                name = tag.get('alt')
                img_src = tag.get('src')
                if name and img_src:
                    list_data.append([name, img_src])
        return dict(list_data)

    def get_image_content(self, url):
        """Download ``url`` and return its bytes.

        The URL is first written to the console and to the log box. Returns
        None when the request raises a RequestException or the status code
        is not 200.
        """
        print('正在下載', url)
        self.info.insert('end', '正在下載:' + url + '\n')
        try:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except RequestException:
            return None
        if r.status_code == 200:
            return r.content
        return None

    def download(self):
        """Crawl the requested number of pages for the selected category.

        Reads the page count, category and destination folder from the
        widgets. Invalid input is reported in the log box and ends the crawl.
        A page that cannot be fetched is reported and skipped.
        """
        try:
            page_count = int(self.page.get())
        except ValueError:
            self._log('請輸入有效的爬取頁數')
            return

        category = self.menu.get()
        cid = self.get_cid()
        if cid is None:
            self._log('未知的圖片分類:' + category)
            return

        root_dir = self.input.get()
        if not root_dir:
            # Without this guard an empty entry would resolve to '/pics'.
            self._log('請先選擇圖片存儲路徑')
            return

        pics_dir = os.path.join(root_dir, 'pics')
        try:
            # One folder per category, created once instead of on every page.
            for name, _ in self.CATEGORIES:
                os.makedirs(os.path.join(pics_dir, name), exist_ok=True)
        except OSError:
            self._log('無法創建目錄:' + pics_dir)
            return
        save_dir = os.path.join(pics_dir, category)

        for page_no in range(1, page_count + 1):
            url = self.BASE_URL + 'cid=' + str(cid) + '&' + 'page=' + str(page_no)
            html = self.get_html(url, self.HEADERS)
            if html is None:
                self._log('頁面請求失敗:' + url)
                continue
            for name, img_src in self.parse_html(html, []).items():
                self._save_image(save_dir, name, img_src)

    def _save_image(self, save_dir, name, img_src):
        """Save one image as ``<save_dir>/<name>q.jpg`` unless it exists."""
        file_path = os.path.join(save_dir, name + 'q' + '.jpg')
        if os.path.exists(file_path):  # already downloaded earlier
            return
        # Fetch first: opening the file before the bytes are known would leave
        # an empty file behind that blocks every later retry.
        content = self.get_image_content(img_src)
        if content is None:
            return
        try:
            with open(file_path, 'wb') as f:
                f.write(content)
        except OSError:
            # e.g. the alt text is not a valid file name; skip this image.
            return
        print('文件保存成功')

    def _log(self, message):
        """Append ``message`` as one line to the on-screen log box."""
        self.info.insert('end', message + '\n')

    def cle(self):
        """Clear the log box."""
        self.info.delete('1.0', 'end')  # from line 1, char 0 to the end
def main():
    """Build the crawler window, lay out its widgets and run the Tk event loop."""
    t = DB()
    t.gui_arrang()
    tk.mainloop()
# Start the GUI only when the file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()
|