import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import MysiteItem
#from scrapy.dupefilters import RFPDupeFilter


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua1'
    allowed_domains = ["wx.dxs6.cn", "www.", "www.dxsabc.com"]
    start_urls = ['http://www./hua/']
    #page_set = set()
    def parse(self, response):
        info_list = Selector(response=response).\
            xpath('//div[starts-with(@class,"item_list")]/div')
        for obj in info_list:
            item = MysiteItem()  # build a fresh item per entry instead of reusing one instance
            name = obj.xpath(".//a/img/@alt").extract_first()
            img = obj.xpath(".//a/img/@src").extract_first()
            img_request = response.urljoin(img)  # response.urljoin completes a relative URL against the current page's URL
            # e.g. a scraped link like update/18883004004.jpg is expanded to http://www./update/18883004004.jpg,
            # while a URL that is already a full http request is left unchanged (response.follow resolves
            # relative paths the same way; see the urljoin sketch after the spider)
            item["url_address"] = img_request
            item["name"] = name
            # hand the item to the pipelines for persistence
            yield item
            #yield response.follow(img, callback=self.parse)  # also auto-completes relative paths
        # follow the pagination links to reach the rest of the site
        page_num = Selector(response=response).xpath('//*[@id="page"]/div/a/@href').extract()
        for url in page_num:
            # if url in self.page_set:
            #     pass
            #     #print("url already seen")
            # else:
            #     self.page_set.add(url)
            # (the manual page_set is unnecessary: Scrapy's scheduler already de-duplicates
            # requests via RFPDupeFilter by default)
            # the yielded Request is handed by the engine to the scheduler for further crawling
            yield Request(url=url, callback=self.parse)
    # def parse_datile(self, response):
    #     print("request------>", response.url)
    #     info_list = Selector(response=response). \
    #         xpath('//div[starts-with(@class,"item_list")]/div')
    #     for obj in info_list:
    #         name = obj.xpath(".//a/img/@alt").extract_first()
    #         img = obj.xpath(".//a/img/@src").extract_first()
    #
    #     for url in self.img_url:
    #         #img_request = response.follow(url, callback=self.parse)
    #         img_request = response.urljoin(url)
    #         yield Request(url=img_request, callback=self.parse)
    #         item = MysiteItem()
    #         item["url_address"] = img_request
    #         yield item
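# A quick illustration of what response.urljoin does, using the standard-library
# urljoin that Scrapy essentially delegates to (example.com is a hypothetical
# stand-in for the real domain):
from urllib.parse import urljoin

base = "http://www.example.com/hua/"                   # hypothetical page URL
print(urljoin(base, "update/18883004004.jpg"))         # -> http://www.example.com/hua/update/18883004004.jpg
print(urljoin(base, "/update/18883004004.jpg"))        # leading slash resolves from the site root
print(urljoin(base, "http://img.example.com/a.jpg"))   # already absolute -> returned unchanged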
# items.py ---- defines the structured data
import scrapy


class MysiteItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    url_address = scrapy.Field()
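# A scrapy.Item behaves like a dict restricted to its declared fields; a quick
# standalone illustration (the values here are made up):
item = MysiteItem(name="demo", url_address="http://www.example.com/update/1.jpg")
print(item["name"])  # dict-style access to a declared field
print(dict(item))    # {'name': 'demo', 'url_address': 'http://www.example.com/update/1.jpg'}
# item["other"] = 1  # would raise KeyError: only declared Fields may be set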
# pipelines.py ------ data persistence (you can define several pipeline classes and give
# each a weight in settings; the ones with smaller numbers run first)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import requests


class MyPipeline(object):
    def __init__(self, picture):
        #self.picture = os.path.join(os.path.dirname(os.path.abspath(__file__)), "img_picture")
        self.picture = picture

    # called for every scraped item
    def process_item(self, item, spider):
        response = requests.get(item["url_address"])
        picture_img = os.path.join(self.picture, item["name"] + ".jpg")  # os.path.join instead of "\\" so it also works off Windows
        with open(picture_img, "wb") as f_write:
            f_write.write(response.content)
        return item  # return the item so any later pipelines still receive it
    # runs once before the crawl starts
    def open_spider(self, spider):
        print("starting crawl ..................")
        if not os.path.exists(self.picture):
            os.mkdir(self.picture)

    # runs once after the crawl finishes
    def close_spider(self, spider):
        print("crawl finished")
    @classmethod
    def from_crawler(cls, crawler):
        picture = crawler.settings.get("IMG_PICTURE")  # read a custom value from the settings file
        return cls(picture)  # cls is the class itself, so this instantiates MyPipeline
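# For completeness, a minimal settings.py sketch that wires the pipeline up. The
# module name "mysite", the priority 300, and the IMG_PICTURE path are assumptions,
# not taken from the original post:
import os

ITEM_PIPELINES = {
    "mysite.pipelines.MyPipeline": 300,  # smaller numbers run earlier when several pipelines are registered
}

# custom setting read by MyPipeline.from_crawler
IMG_PICTURE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "img_picture")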