0
回答
Scrapy爬取网页内容保存问题
【腾讯云】校园拼团福利,1核2G服务器10元/月!>>>   

数据应该是爬取成功了但是不知道为啥没有保存成功看的网课没办法问老师请各位大佬帮忙看下:

#获取上海深圳的部分信息
#从东方财富获取上深的各个股票名称和编号
#根据其列表和编号在百度股票中提取各股信息
#东方财富http://quote.eastmoney.com/stocklist.html
#百度股票https://gupiao.baidu.com/stock/

import scrapy
import re

class StocksSpider(scrapy.Spider):
    name = 'stocks'
    start_urls = ['http://quote.eastmoney.com/stocklist.html']

    def parse(self, response):
        for href in response.css('a::attr(href)').extract():
            try:
                names=re.findall(r'[s][hz]\d{6}')
                url='https://gupiao.baidu.com/stock/'+names+'html'
                yield response.Request(url,callback=self.parse_stock)
            except:
                continue
    def parse_stock(self,response):
        infoDict={}
        stockInfo=response.css('stock-bets')
        name=stockInfo.css('bets-name').extract()[0]
        keyName=stockInfo.css('dt').extract()
        keyValue=stockInfo.css('dd').extract()
        for i in range(len(keyName)):
            key=re.findall(r'<.*</dt>',keyName[i])[0][1:-5]
            try:
                value=re.findall(r'\d+\.?.<dd>',keyValue[i])[0][1:-5]
            except:
                value ='--'
            infoDict[key]=value
        infoDict.update({
            '股票名称':re.findall(r'\s.*\(',name)[0].split()[0]+\
                        re.findall(r'\<.*>',name)[0][1:-1]
        })
        yield infoDict

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BaidustocksPipeline(object):
    def process_item(self, item, spider):
        return item
class BaidustocksInfoPipeline(object):
    def stocOpen(self,spider):
        self.f=open('new.txt','w')

    def stocClose(self,spider):
        self.f.close()

    def messStock(self,item,spider):
        try:
            line=str(dict(item))+'\n'
            self.f.write(line)
        except:
            pass
        return item

setting:



ITEM_PIPELINES = {
    'baiduStocks.pipelines.BaidustocksInfoPipeline': 300,
}

 

<无标签>
举报
小白老土豆
发帖于3个月前 0回/33阅
顶部