Help: while debugging, Scrapy shows the data being scraped successfully, but the CSV file exported from the command line is empty

Python 3.6

Scrapy 1.4.0

When debugging with breakpoints I can see the data being scraped, but the CSV file exported from the command line is empty.
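
The exact export command isn't shown in the question; as a reference point, here is a minimal sketch of the two usual ways to get a CSV feed out of Scrapy 1.4, assuming the spider name 'dingdian' from the code below: either run scrapy crawl dingdian -o dingdian.csv on the command line, or set the equivalent feed-export options in settings.py:

# settings.py -- hypothetical feed-export configuration, equivalent to running
# `scrapy crawl dingdian -o dingdian.csv` (FEED_FORMAT / FEED_URI are the
# Scrapy 1.4 setting names for the built-in feed exporter)
FEED_FORMAT = 'csv'          # serialize exported items as CSV
FEED_URI = 'dingdian.csv'    # file the feed exporter writes to

The CSV feed is written from the items the engine gets back after any enabled item pipelines have processed them, one row per item.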

Spider file:

import scrapy
import re
from lxml import etree
from scrapy.http import Request
from dingdian.items import DingdianItem


########################################################################
class myspider(scrapy.Spider):
    """Spider for the 23us.so novel listing pages."""
    name = 'dingdian'
    #allow_domains =['x23us.com']
    base_url = 'http://www.23us.so/list/'
    
    
    #----------------------------------------------------------------------
    def __init__(self):
        """Constructor"""
    #----------------------------------------------------------------------
    def start_requests(self):
        """Yield the first listing page to start the crawl."""
        
        for i in range(1,2):
            #url =self.base_url +str(i)+'_1.html'
            url ='http://www.23us.so/list/1_1.html'
            yield Request(url,self.parse)
        #yield Request('http://www.x23us.com/quanben/1',self.parse)
            
    def parse(self,response):
        #print(response.url)     # the initial URL is correct
        #print(response.text)
        #pattern =re.compile('<a href=.*?" class="last">(.*?)</a>')
        #pageIndex =re.findall(pattern, response)
        pageIndex= response.xpath('//a[@class="last"]/text()').extract()
        print(pageIndex)        
        baseurl = str(response.url)[:-7]
        for num in range(1,int(pageIndex[0])-200):
            url =baseurl+'_'+str(num) +'.html'
            yield Request(url,callback=self.getname)

    #----------------------------------------------------------------------
    def getname(self, response):
        """Extract each novel's name and URL from a listing page."""
        #contents= response.xpath('//a[@class="last"]/text()').extract()
        #print(pageIndex)
        tds =response.xpath('//tr[@bgcolor="#FFFFFF"]')
        for td in tds:
            novelname = td.xpath('./td[@class="L"]/a/text()').extract_first()
            novelurl = td.xpath('./td[@class="L"]/a/@href').extract_first()
            yield Request(novelurl, callback=self.getcontent, meta={'name': novelname,
                                                                    'url': novelurl})
    
    #----------------------------------------------------------------------
    def getcontent(self, response):
        """Populate a DingdianItem from a novel's detail page."""
        item = DingdianItem()
        item['name']=str(response.meta['name'])
        item['novelurl']=response.url
        #novelurl = response.url
        tds = response.xpath('//table[@id="at"]')
        for td in tds:
            item['author'] = td.xpath('//tr[1]/td[2]/text()').extract()
            item['serialstatus'] = td.xpath('//tr[1]/td[3]/text()').extract()
            item['lastupdatatime'] = td.xpath('//tr[2]/td[3]/text()').extract()
            item['like'] = td.xpath('//tr[2]/td[1]/text()').extract()
            print(item['author'], item['novelurl'], item['serialstatus'],
                  item['lastupdatatime'], item['like'])
            yield item

Pipeline file:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals  
import json  
import codecs  
import sys  


class DingdianPipeline(object):
    #----------------------------------------------------------------------    
    def __init__(self):  
        self.file = codecs.open('dingdian.json', mode='wb', encoding='utf-8')  
  
    def process_item(self, item, spider):  
        pass      
        ##link_url = item['link_url']  
        #file_name = link_url[7:-6].replace('/','_')  
        #file_name += ".txt"  
        #fp = open("dingdianspider.txt", 'w')  
        #fp.write(item['name'],item['author'],item['novelurl'],item['serialstatus'],r"\n")  
        #fp.close()  
        #return item  
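
One thing worth noting about the pipeline above, in case it is enabled in ITEM_PIPELINES (the settings file isn't shown): Scrapy expects process_item() to return the item (or raise DropItem); whatever it returns is what later pipelines and the feed exporter receive. A minimal pass-through sketch, reusing the same dingdian.json file name:

# pipelines.py -- minimal sketch of a pass-through pipeline; the key point is
# that process_item() returns the item so downstream components (including the
# CSV feed exporter) still receive it.
import codecs
import json


class DingdianPipeline(object):
    def __init__(self):
        self.file = codecs.open('dingdian.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        # write one JSON line per item, then hand the item back to Scrapy
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()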

Item file:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()
    novelurl = scrapy.Field()
    serialstatus = scrapy.Field()
    lastupdatatime = scrapy.Field()
    like = scrapy.Field()
    #name_id =scrapy.Field()

 

Posted by 12312dfa, 3 months ago · 1 reply / 86 views