python爬虫 爬取学校教学服务系统 超时

BigGhost 发布于 2015/05/27 10:58
阅读 1K+
收藏 0
import hashlib
import http.cookiejar
import re
import socket
import time
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
# Obfuscate the student id
def chkyhm(id):
    """Return the XOR-obfuscated form of *id* as a flat list.

    Each character is replaced by its code point XOR 0x12 (an int), and
    the string '.' is placed between consecutive values.  An empty input
    yields an empty list.
    """
    encoded = []
    for position, char in enumerate(id):
        # Separator goes before every entry except the first one.
        if position:
            encoded.append('.')
        encoded.append(ord(char) ^ 0x12)
    return encoded
# Thin wrapper around hashlib's MD5
def md5(string):
    """Return the lowercase hex MD5 digest of *string*.

    The text is encoded with str.encode()'s default encoding before hashing.
    """
    return hashlib.md5(string.encode()).hexdigest()
# Hash the password
def chkpwd(pwd, yhm='201308010104', schoolcode='10532'):
    """Return the hashed password value submitted with the login form.

    The result is the first 30 hex characters, upper-cased, of
    md5(yhm + P + schoolcode), where P is the first 30 hex characters,
    upper-cased, of md5(pwd).

    pwd:        plain-text password.
    yhm:        student id mixed into the hash.  It used to be fixed
                inside the function, which produced a wrong value for any
                other account; the default keeps the previous behaviour.
    schoolcode: school code appended before the outer hash; the default
                keeps the previous behaviour.
    """
    pwd_digest = md5(pwd)[0:30].upper()
    return md5(yhm + pwd_digest + schoolcode)[0:30].upper()
# Hash the captcha text
def chkyzm(yzm):
    """Return the hashed captcha value submitted with the login form.

    The captcha is upper-cased and hashed; the first 30 hex characters of
    that digest are upper-cased, the school code '10532' is appended, and
    the result is hashed again.  The first 30 hex characters of the second
    digest, upper-cased, are returned.
    """
    schoolcode = '10532'
    inner = md5(yzm.upper())[0:30].upper()
    outer = md5(inner + schoolcode)
    return outer[0:30].upper()


class HNU:
    """Small urllib-based client for the hdjw.hnu.cn login form.

    One cookie-aware opener is shared by every request, so the session
    cookie handed out with the captcha image is sent back with the login
    POST.
    """

    def __init__(self, timeout=30):
        """Set up the URLs, the cookie-aware opener and the request headers.

        timeout: seconds to wait on each HTTP request (default 30).
        """
        self.timeout = timeout

        self.loginUrl = 'http://hdjw.hnu.cn/_data/index_login.aspx'
        self.loginPage = 'http://hdjw.hnu.cn/default.aspx'

        # Cookie handling: every request made through self.opener shares
        # this jar.
        self.cookies = http.cookiejar.MozillaCookieJar()
        self.handler = urllib.request.HTTPCookieProcessor(self.cookies)
        self.opener = urllib.request.build_opener(self.handler)

        # Headers deliberately NOT set here:
        #  - Content-Length: urllib computes it from the POST body; a fixed
        #    value that disagrees with the real body length leaves the
        #    server waiting for data that never arrives (request times out).
        #  - Cookie: a fixed Cookie header would take the place of the
        #    cookies held in the jar, losing the captcha session.
        #  - Accept-Encoding: urllib does not decompress gzip/deflate, and
        #    the response body is decoded directly as GBK text.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'hdjw.hnu.cn',
            'Origin': 'http://hdjw.hnu.cn',
            'Referer': 'http://hdjw.hnu.cn/_data/index_login.aspx',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER',
        }

    def getInitPage(self):
        """Fetch the landing page and return it decoded as GBK text.

        Returns None (after printing the reason) when the request fails
        or times out.
        """
        try:
            with self.opener.open(self.loginPage, timeout=self.timeout) as response:
                return response.read().decode('gbk')
        except (urllib.error.URLError, socket.timeout) as error:
            print('连接湖南大学教学服务系统失败,错误原因:', getattr(error, 'reason', error))
            return None

    def getSecurityCode(self):
        """Download the captcha to Code.png and return what the user types.

        The image is fetched through the shared opener so that any cookie
        issued with it lands in the jar.  The user is expected to open
        Code.png and type the characters on stdin.
        """
        im_url = 'http://hdjw.hnu.cn/sys/ValidateCode.aspx'
        with self.opener.open(im_url, timeout=self.timeout) as response:
            im_data = response.read()
        with open('Code.png', 'wb') as f:
            f.write(im_data)
        validateCode = input()
        return str(validateCode)

    def getPostPage(self, code, username, password, s1, s2, s3):
        """Submit the login form and return the response decoded as GBK.

        code:     captcha text typed by the user.
        username: value for the user-name form field.
        password: value for the password form field.
        s1:       value for the 'aerererdsdxcxdfgfg' field.
        s2:       value for the 'efdfdfuuyyuuckjg' field.
        s3:       value for the 'werereruuyyuxcxcx' field.

        Returns None (after printing the reason) when the request fails
        or times out.
        """
        postDict = {
            '__VIEWSTATE': '',
            'Sel_Type': 'STU',
            'txt_dsdsdsdjkjkjc': username,
            'txt_dsdfdfgfouyy': password,
            'txt_ysdsdsdskgf': code,
            'pcInfo': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSERundefined5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER SN:NULL',
            'typeName': '学生',
            'aerererdsdxcxdfgfg': s1,
            'efdfdfuuyyuuckjg': s2,
            'werereruuyyuxcxcx': s3,
        }
        postData = urllib.parse.urlencode(postDict).encode()
        req = urllib.request.Request(self.loginUrl, postData, self.headers)
        try:
            # Per-request timeout instead of socket.setdefaulttimeout(), so
            # no process-wide socket state is changed.  The response (not
            # the Request, which has no close()) is closed by the with block.
            with self.opener.open(req, timeout=self.timeout) as response:
                return response.read().decode('gbk')
        except (urllib.error.URLError, socket.timeout) as error:
            # A timeout while reading the reply surfaces as socket.timeout,
            # which carries no .reason attribute.
            print(getattr(error, 'reason', error))
            return None
        
    
    
# Manual smoke test: obfuscate the student id, fetch the captcha, log in.
# chkyhm() yields ints and '.' separators; glue them into one string.
string = ''.join(map(str, chkyhm('201308010104')))

# Debug helpers, left disabled:
# print(string)
# print(chkpwd(''))
# print(chkyzm('EC75'))

hnu = HNU()
# Downloads Code.png and waits for the captcha text on stdin.
code = hnu.getSecurityCode()
s1 = chkyzm(code)
s2 = chkpwd('XXXXXXXXX')
s3 = string
data = hnu.getPostPage(code, "XXXXXXXXXX", "XXXXXXXXX", s1, s2, s3)
print(data)


POST 表单提交的字段大概有 9 个,其中后面三个经过了 MD5 加密处理。手动输入验证码后,等待了很长时间,然后出现:

WPD7
Traceback (most recent call last):
  File "G:\Python\Python3\Spider_HNU\Spider_HNU_JXFW.py", line 134, in <module>
    data=hnu.getPostPage(code, "201308010104", "242833",s1,s2,s3)
  File "G:\Python\Python3\Spider_HNU\Spider_HNU_JXFW.py", line 108, in getPostPage
    response = self.opener.open(req)
  File "C:\Python34\lib\urllib\request.py", line 463, in open
    response = self._open(req, data)
  File "C:\Python34\lib\urllib\request.py", line 481, in _open
    '_open', req)
  File "C:\Python34\lib\urllib\request.py", line 441, in _call_chain
    result = func(*args)
  File "C:\Python34\lib\urllib\request.py", line 1210, in http_open
    return self.do_open(http.client.HTTPConnection, req)
  File "C:\Python34\lib\urllib\request.py", line 1185, in do_open
    r = h.getresponse()
  File "C:\Python34\lib\http\client.py", line 1171, in getresponse
    response.begin()
  File "C:\Python34\lib\http\client.py", line 351, in begin
    version, status, reason = self._read_status()
  File "C:\Python34\lib\http\client.py", line 313, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Python34\lib\socket.py", line 374, in readinto
    return self._sock.recv_into(b)
socket.timeout: timed out



总之就是timed out。。。

加载中
0
MrZQ
MrZQ
用requests试试~
BigGhost
BigGhost
嗯嗯~因为要学习基础,所以用的是 urllib 库。现在已经用几句代码解决了:原因是 header 里的信息太多,所以才会超时。
返回顶部
顶部