Scraping images from the Xiaohua (校花网) site with Scrapy

 小豬窩969 2019-01-03
# Spider code
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import MysiteItem
#from scrapy.dupefilters import RFPDupeFilter


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua1'
    allowed_domains = ["wx.dxs6.cn", "www.", "www.dxsabc.com"]
    start_urls = ['http://www./hua/']
    #page_set = set()

    def parse(self, response):
        info_list = Selector(response=response).\
            xpath('//div[starts-with(@class,"item_list")]/div')
        for obj in info_list:
            # create a fresh item per entry so every yield carries its own data
            item = MysiteItem()
            name = obj.xpath(".//a/img/@alt").extract_first()
            img = obj.xpath(".//a/img/@src").extract_first()
            # response.urljoin completes a relative URL against the page URL (the start_url here).
            # For example, if img is update/18883004004.jpg it becomes
            # http://www./update/18883004004.jpg; if it is already a complete http URL
            # it is left unchanged. response.follow behaves the same way.
            img_request = response.urljoin(img)
            item["url_address"] = img_request
            item["name"] = name
            # hand the item to the pipelines for persistence
            yield item
            #yield response.follow(img, callback=self.parse)  # also completes relative paths (see the sketch after the spider code)



        # collect the pagination links (crawl depth of the site)
        page_num = Selector(response=response).xpath('//*[@id="page"]/div/a/@href').extract()

        for url in page_num:
            # if url in self.page_set:
            #     pass
            #     #print(u"url already seen")
            # else:
            #     self.page_set.add(url)
            # yielded Requests go back to the Scrapy engine, which hands them to the scheduler
            yield Request(url=url, callback=self.parse)

    # def parse_datile(self, response):
    #
    #     print("request------>", response.url)
    #     info_list = Selector(response=response). \
    #         xpath('//div[starts-with(@class,"item_list")]/div')
    #     for obj in info_list:
    #         name = obj.xpath(".//a/img/@alt").extract_first()
    #         img = obj.xpath(".//a/img/@src").extract_first()
    #
    #     for url in self.img_url:
    #         #img_request = response.follow(url, callback=self.parse)
    #         img_request = response.urljoin(url)
    #         yield Request(url=img_request, callback=self.parse)
    #         item = MysiteItem()
    #         item["url_address"] = img_request
    #         yield item
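As the commented-out response.follow lines above suggest, the pagination loop can also be written without an explicit urljoin: response.follow accepts relative URLs directly, and Scrapy's default request dupefilter already drops URLs that were scheduled before, which is what the commented-out page_set bookkeeping did by hand. A minimal, self-contained sketch of that variant (the spider name and start URL below are placeholders, not the original project's values):

import scrapy


class FollowDemoSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate response.follow pagination
    name = "follow_demo"
    start_urls = ["http://example.com/hua/"]

    def parse(self, response):
        # response.follow joins relative hrefs against response.url for us;
        # duplicate requests are filtered by the default dupefilter unless
        # dont_filter=True is passed on the Request.
        for href in response.xpath('//*[@id="page"]/div/a/@href').extract():
            yield response.follow(href, callback=self.parse)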
# items.py: the structured data for the scraped fields

import scrapy


class MysiteItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    url_address = scrapy.Field()
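MysiteItem behaves like a dict that only accepts its declared fields; assigning a key that was not declared with scrapy.Field() raises a KeyError, which catches typos early. A quick illustration with made-up values:

item = MysiteItem()
item["name"] = "example"                          # declared field: fine
item["url_address"] = "http://example.com/1.jpg"  # declared field: fine
# item["title"] = "oops"                          # not declared, would raise KeyError
print(dict(item))   # items convert cleanly to plain dicts for export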
# pipelines.py: data persistence. You can define several pipeline classes and give each a priority in ITEM_PIPELINES; lower numbers run first (see the settings sketch after the pipeline code).
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import sys, os
import codecs
import json
import requests
from scrapy.http import Request

class MyPipeline(object):
    def __init__(self, picture):
        #self.picture = os.path.join(os.path.dirname(os.path.abspath(__file__)), "img_picture")
        self.picture = picture

    # process each scraped item
    def process_item(self, item, spider):
        response = requests.get(item["url_address"])
        picture_img = os.path.join(self.picture, item["name"] + ".jpg")
        with open(picture_img, "wb") as f_write:
            f_write.write(response.content)
        # return the item so that any later pipelines can keep processing it
        return item

    # runs once before the crawl starts
    def open_spider(self, spider):
        print("crawl started ...")
        if not os.path.exists(self.picture):
            os.mkdir(self.picture)

    # runs once after the crawl finishes
    def close_spider(self, spider):
        print("crawl finished")

    @classmethod
    def from_crawler(cls, crawler):
        picture = crawler.settings.get("IMG_PICTURE")  # read the custom value from settings.py
        return cls(picture)  # cls is the class itself, so this instantiates MyPipeline
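For the pipeline to run at all, it has to be registered in the project's settings.py, and from_crawler expects the custom IMG_PICTURE key there as well. A sketch of the relevant settings, assuming the project package is called mysite (suggested by the `from ..items import MysiteItem` import, not confirmed by the original post):

# settings.py (sketch)
ITEM_PIPELINES = {
    # lower numbers run first; add further pipeline classes with other numbers to chain them
    "mysite.pipelines.MyPipeline": 300,
}

# custom key read in MyPipeline.from_crawler via crawler.settings.get("IMG_PICTURE")
IMG_PICTURE = "img_picture"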
