0
回答
scrapy 模拟登录知乎遇到验证码不通过的问题
终于搞明白,存储TCO原来是这样算的>>>   
# -*- coding: utf-8 -*-
import scrapy
from scrapy.shell import inspect_response
from scrapy.http import Request, FormRequest
#from scrapy.selector import Selector
import requests
import time
import json

class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://zhihu.com/']
    referer_url = "https://www.zhihu.com/"
    login_url = "https://www.zhihu.com/login/email"
    check_login_url = "https://www.zhihu.com/settings/profile"
    login_formdata = {
                        'email': 'xxxx',
                        'password': 'eee'
                     } 

    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests":"1",
        "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With":"XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36"
    }
    def start_requests(self):
        print('start_requests')
        self.headers['Referer'] = self.referer_url
        self.headers['Host'] = "www.zhihu.com"
        return [Request("https://www.zhihu.com", meta = {'cookiejar' : 1}, headers = self.headers,callback = self.post_login)] 
    
    def captcha_handler(self, response):
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
        with open("code.jpg",'wb') as w:
            req=requests.Session()
            p=req.get(url=captcha_url,headers=self.headers)
            w.write(p.content)
            
        code=input("请输入验证码:")
        if not code:
            sys.exit(1)
        else:
            self.login_formdata["captcha"] = code
            
    def pre_signin_handler(self, response):
        #inspect_response(response,self)
        _xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first()
        self.headers["X-Xsrftoken"] = _xsrf
        self.login_formdata['_xsrf'] = _xsrf
        print(self.login_formdata['_xsrf'])
        self.captcha_handler(response)
 
    def post_login(self, response):
        print('post_login')
        self.pre_signin_handler(response)
        print(self.login_formdata)
        #FormRequeset.from_response是Scrapy提供的一个函数, 用于post表单
        return FormRequest("https://www.zhihu.com/login/email",meta={'cookiejar':response.meta['cookiejar']},
                            headers = self.headers,
                            formdata = {
                               '_xsrf':self.login_formdata['_xsrf'],
                               'password':'xxxx',
                               'email':'eeee',
                               'captcha':self.login_formdata["captcha"],
                            },
                            callback = self.__check_login_status,
                            )
        '''                        
        return [FormRequest.from_response(response,
                            meta ={'cookiejar' : 1},#cookies = self.cookies,
                            headers = self.headers,  #注意此处的headers
                            formdata = self.login_formdata,
                            callback = self.__check_login_status,
                            url = self.login_url,
                            dont_filter = True
                            )]
         '''                   
    def __check_login_status(self, response):
        # '用来检测是否登陆成功'
        print("----__check_login_status----")
        print(eval(response.text))
        if json.loads(response.text)['r'] == 0:
            print("登录成功")
        else:
            print("登录失败")
        #from scrapy.shell import inspect_response
        #inspect_response(response, self)

        yield Request(self.check_login_url, meta = {'cookiejar' : 1}, headers=self.headers,callback=self.parse_user_detail)
        
    def parse_user_detail(self, response):
        print("----parse_user_detail----")
        #with open('response_of_user_detil.html','wb') as file:
        #    file.write(response.body)

用scrapy模拟登录知乎会出现验证码失败的情况,返回:

{'data': {'captcha': '验证码会话无效 :(', 'name': 'ERR_VERIFY_CAPTCHA_SESSION_IN
VALID'}, 'msg': '验证码会话无效 :(', 'r': 1, 'errcode': 1991829}

 

哪位大神有用scrapy模拟登录过?清指点下,谢谢!

举报
anbency
发帖于2个月前 0回/101阅
顶部