1
回答

python3.6
scrapy 1.4.0
断点调试时可以看到数据，但是命令行导出的 csv 文件为空。
spider文件:
import scrapy
import re
from lxml import etree
from scrapy.http import Request
from dingdian.items import DingdianItem
########################################################################
class myspider(scrapy.Spider):
    """Crawl the 23us.so listing pages and yield one DingdianItem per novel.

    Flow: start_requests -> parse (discover page count) -> getname (one
    listing page) -> getcontent (one novel detail page -> item).
    """

    name = 'dingdian'
    #allow_domains = ['x23us.com']
    base_url = 'http://www.23us.so/list/'

    # BUG FIX: the original defined an empty ``__init__`` (docstring-only body)
    # that overrode scrapy.Spider.__init__ without calling super(), skipping
    # the framework's own initialisation.  Removing the override restores it.

    def start_requests(self):
        """Seed the crawl with the first listing page of category 1."""
        for i in range(1, 2):
            # NOTE(review): the loop ignores ``i`` and always requests
            # category 1 page 1; the base_url variant is commented out above.
            url = 'http://www.23us.so/list/1_1.html'
            yield Request(url, self.parse)

    def parse(self, response):
        """Read the total page count and schedule every listing page."""
        page_index = response.xpath('//a[@class="last"]/text()').extract()
        if not page_index:
            # Defensive: layout changed or the request was blocked.
            self.logger.warning('no "last page" link found on %s', response.url)
            return
        base = str(response.url)[:-7]  # strip the trailing '_1.html'
        # '- 200' keeps the crawl small while debugging; remove it to crawl
        # every page reported by the "last" link.
        for num in range(1, int(page_index[0]) - 200):
            yield Request(base + '_' + str(num) + '.html',
                          callback=self.getname)

    def getname(self, response):
        """Extract (title, url) for every novel row on one listing page."""
        rows = response.xpath('//tr[@bgcolor="#FFFFFF"]')
        for row in rows:
            novelname = row.xpath('./td[@class="L"]/a/text()').extract_first()
            novelurl = row.xpath('./td[@class="L"]/a/@href').extract_first()
            if not novelurl:
                continue
            # BUG FIX: meta['url'] used to carry ``novelname`` instead of the
            # URL; also use extract_first() so meta values are strings,
            # not one-element lists.
            yield Request(novelurl, callback=self.getcontent,
                          meta={'name': novelname, 'url': novelurl})

    def getcontent(self, response):
        """Fill a DingdianItem from a novel's detail page and yield it."""
        item = DingdianItem()
        item['name'] = str(response.meta['name'])
        item['novelurl'] = response.url
        tables = response.xpath('//table[@id="at"]')
        for table in tables:
            # BUG FIX: the original xpaths started with '//', which searches
            # the whole document instead of the selected table; './/' keeps
            # the query relative to ``table``.
            item['author'] = table.xpath('.//tr[1]/td[2]/text()').extract()
            item['serialstatus'] = table.xpath('.//tr[1]/td[3]/text()').extract()
            item['lastupdatatime'] = table.xpath('.//tr[2]/td[3]/text()').extract()
            item['like'] = table.xpath('.//tr[2]/td[1]/text()').extract()
            # BUG FIX: the original printed undefined local names
            # (author, novelurl, serialstatus, ...) -> NameError aborted this
            # callback before ``yield item``, so the exported csv/json feed
            # stayed empty even though breakpoint debugging showed data.
            print(item['author'], item['novelurl'], item['serialstatus'],
                  item['lastupdatatime'], item['like'])
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json
import codecs
import sys
class DingdianPipeline(object):
    """Write each scraped item to ``dingdian.json`` as one JSON object per line."""

    def __init__(self):
        # Text mode 'w': codecs.open performs the utf-8 encoding itself,
        # so the binary 'wb' of the original is unnecessary.
        self.file = codecs.open('dingdian.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        """Persist ``item`` and hand it on to the next pipeline/exporter.

        BUG FIX: the original body was ``pass``, which returned None —
        every item was dropped here, so nothing was written and downstream
        exporters received no data (one cause of the empty csv export).
        A pipeline must ``return item``.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Release the file handle when the spider finishes.
        self.file.close()
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json
import codecs
import sys
class DingdianPipeline(object):
    """Item pipeline that stores scraped items as UTF-8 JSON lines."""

    def __init__(self):
        # codecs.open handles the utf-8 encoding, so plain text mode is enough.
        self.file = codecs.open('dingdian.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` to one JSON line and pass it along.

        BUG FIX: the implementation was just ``pass`` — it implicitly
        returned None, silently discarding every item, so exporters saw
        no data.  Returning the item restores the pipeline contract.
        """
        self.file.write(json.dumps(dict(item), ensure_ascii=False))
        self.file.write('\n')
        return item

    def close_spider(self, spider):
        # Flush and close the output file at the end of the crawl.
        self.file.close()
item文件:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class DingdianItem(scrapy.Item):
    """Container for one novel scraped from the listing site."""

    name = scrapy.Field()            # novel title (from the listing link text)
    author = scrapy.Field()          # author cell of the detail-page table
    novelurl = scrapy.Field()        # URL of the novel's detail page
    serialstatus = scrapy.Field()    # serialisation status cell
    lastupdatatime = scrapy.Field()  # last-update timestamp cell
    like = scrapy.Field()            # popularity cell — presumably a counter; verify