scrapy知乎模拟登录失败

anbency 发布于 2017/06/26 17:24
阅读 293
收藏 0

下面的代码模拟登录知乎,在登录的时候没有成功

check_login提示失败:

check_login
{
    "r": 1,
    "errcode": 1991829,
    "data": {"captcha":"\u9a8c\u8bc1\u7801\u4f1a\u8bdd\u65e0\u6548 :(","name":"ERR_VERIFY_CAPTCHA_SESSION_INVALID"},
    "msg": "\u9a8c\u8bc1\u7801\u4f1a\u8bdd\u65e0\u6548 :("
}

有没有兄弟有成功的经验,望指点一二,谢谢

# -*- coding: utf-8 -*-
import scrapy
import json

class ZhihuSpider(scrapy.Spider):
    """Spider that signs in to Zhihu by posting an e-mail/password form.

    Flow: request the sign-in page, read the hidden ``_xsrf`` token from it,
    post the credentials to the e-mail login endpoint, and, when the reply
    reports success, download the home page into ``first_page.html``.
    """

    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    # Browser-like headers sent with the login post and the home-page request.
    headers = {
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
    }

    def start_requests(self):
        """Return the initial request for the sign-in page.

        The return value must be an iterable of requests, hence the list.
        """
        return [scrapy.Request('http://www.zhihu.com/#signin', callback=self.login)]

    def login(self, response):
        """Post the login form using the ``_xsrf`` token found in *response*.

        Returns a one-element list holding the login ``FormRequest``, or an
        empty list when the page contains no ``_xsrf`` input (previously this
        case raised a bare ``IndexError``).
        """
        _xsrf = response.xpath('//input[@name="_xsrf"]/@value').extract_first()
        if _xsrf is None:
            self.logger.error('no _xsrf input found on %s; cannot log in', response.url)
            return []
        self.logger.debug('_xsrf token: %s', _xsrf)

        # NOTE(review): no captcha field is posted here; if the server demands
        # one, the rejection is reported by check_login.
        return [scrapy.FormRequest(
            url='http://www.zhihu.com/login/email',    # actual POST endpoint
            formdata={
                '_xsrf': _xsrf,
                'email': 'xxxxxx',    # email
                'password': 'xxxxx',    # password
                'remember_me': 'true',
            },
            headers=self.headers,
            callback=self.check_login,
        )]

    def check_login(self, response):
        """Inspect the login reply and request the home page on success.

        The reply is parsed as JSON; a value of 0 in its ``r`` field is treated
        as success. Any other reply is logged as a failure and nothing is
        yielded.
        """
        self.logger.debug('login response: %s', response.text)
        try:
            result = json.loads(response.text)
        except ValueError:
            self.logger.error('login response is not valid JSON')
            return
        if not isinstance(result, dict) or result.get('r') != 0:
            # Surface the server's own explanation instead of failing silently.
            reason = result.get('msg') if isinstance(result, dict) else result
            self.logger.error('login failed: %s', reason)
            return

        yield scrapy.Request(
            'http://www.zhihu.com',
            headers=self.headers,
            callback=self.page_content,
            dont_filter=True,    # same URL family was already requested
        )

    def page_content(self, response):
        """Write the raw body of *response* to ``first_page.html``."""
        with open('first_page.html', 'wb') as f:
            f.write(response.body)
        self.logger.info('saved first_page.html')

加载中
0
南寻
南寻

试试这个

# encoding=utf-8
import requests
import re
import sys
# Request headers sent with every call.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Host": "www.zhihu.com",
    "Origin": "https://www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    ),
    "x-hd-token": "hello",
}

# Login form fields: fill in the account (phone number) and password here.
# The "_xsrf" and "captcha" entries are placeholders overwritten by login().
post_data = {
    "_xsrf": "***",
    "password": "****",
    "captcha": "***",
    "phone_num": "*****",
}

# One shared HTTP session: login() sends all of its requests through it and
# returns it, so its cookies can be read afterwards.
req=requests.Session()

def login():
    """Log in through the phone-number endpoint and return the session.

    Steps: fetch the sign-in page, pull the hidden ``_xsrf`` token out of it,
    download the captcha image to ``../code.jpg``, ask the user to type the
    captcha, then post ``post_data``. The module-level ``headers`` and
    ``post_data`` dicts are updated in place with the token and captcha.

    Returns:
        The module-level ``req`` session used for all requests.

    Exits the process with a non-zero status when the ``_xsrf`` token cannot
    be found or when the user enters an empty captcha.
    """
    page = req.get(url="https://www.zhihu.com/#signin", headers=headers)
    found = re.search(u'<input type="hidden" name="_xsrf" value="(.*?)"/>', page.text, re.S)
    if found is None:
        # Previously an opaque IndexError; fail with a readable message.
        sys.exit("_xsrf token not found in the sign-in page")
    xsrf = found.group(1)
    headers['X-Xsrftoken'] = xsrf
    post_data['_xsrf'] = xsrf

    # Download the captcha first, then write it, so a failed request does not
    # leave an empty image file behind.
    # NOTE(review): the `r` query value is hard-coded; confirm whether the
    # server expects a fresh value per request.
    captcha = req.get(url="https://www.zhihu.com/captcha.gif?r=1495546872530&type=login", headers=headers)
    with open("../code.jpg", 'wb') as image_file:
        image_file.write(captcha.content)

    code = input("请输入验证码:")
    if not code:
        sys.exit(1)
    post_data['captcha'] = code
    res = req.post(url='https://www.zhihu.com/login/phone_num', data=post_data, headers=headers)
    print(res.text)
    return req

# Runs the interactive login() as soon as this line executes and keeps the
# session's cookies as returned by get_dict().
cookie=login().cookies.get_dict()

参考参考这个项目: https://git.oschina.net/nanxun/zhihu.git

a
anbency
另外,能不能共享下数据库的user表单?我的似乎有些问题: "Incorrect string value: '\\xE8\\xB5\\xB0\\xE9\\xA9\\xAC...' for column 'nic_name'"。非常感谢你的答复以及分享,谢谢!
a
anbency
想请教下: 我在浏览器上看到的验证码是要求选择倒立的文字,而你的项目里面抓出来的是英文字母。看了下,两者的url会有不同: https://www.zhihu.com/captcha.gif?r=1498526246164&type=login 和 https://www.zhihu.com/captcha.gif?r=1498526246164&type=login&lang=cn 。想知道这个是怎么发现的?
返回顶部
顶部