抓取这个网站http://www.httpdaili.com 上的免费代理

qq1116767 发布于 2015/10/26 13:46
阅读 699
收藏 1
  • import urllib2
  • import re
  • import sys
  • import chardet
  • import threading
  • import time
  • #reload(sys)
  • #sys.setdefaultencoding('utf-8')

  • rawProxyList = []
  • checkedProxyList = []

  • targets = ['www.httpdaili.com/mfdl']

  • #正则
  • retext = '''<span class="tbBottomLine" style="width:140px;">[\r\n\s]*(.+?)[\r\n\s]+</span>[\r\n\s]*'''
  • retext += '''<span class="tbBottomLine" style="width:50px;">[\r\n\s]*(.+?)[\r\n\s]*</span>[\r\n\s]*'''
  • retext += '''<span class="tbBottomLine " style="width:70px;">[\r\n\s]*.+[\r\n\s]*</span>[\r\n\s]*'''
  • retext += '''<span class="tbBottomLine " style="width:70px;">[\r\n\s]*(.+?)[\r\n\s]*</span>[\r\n\s]*'''
  • p = re.compile(retext,re.M)

  • #获取代理的类
  • class ProxyGet(threading.Thread):
  •     def __init__(self,target):
  •         threading.Thread.__init__(self)
  •         self.target = target

  •     def getProxy(self):
  •         print "目标网站: " + self.target
  •         req = urllib2.urlopen(self.target)
  •         result = req.read()
  •         #print chardet.detect(result)
  •         matchs = p.findall(result)
  •         for row in matchs:
  •             ip = row[0]
  •             port = row[1]
  •             address = row[2].decode("utf-8".encode("gbk"
  •             proxy = [ip,port,address]
  •             #print proxy
  •             rawProxyList.append(proxy)

  •     def run(self):
  •         self.getProxy()


  • #检验代理的类
  • class ProxyCheck(threading.Thread):
  •     def __init__(self,proxyList):
  •         threading.Thread.__init__(self)
  •         self.proxyList = proxyList
  •         self.timeout = 5
  •         self.testUrl = "http://www.baidu.com/"
  •         self.testStr = "030173"

  •     def checkProxy(self):
  •         cookies = urllib2.HTTPCookieProcessor()
  •         for proxy in self.proxyList:
  •             proxyHandler = urllib2.ProxyHandler({"http" : r'http://%s:%s' %(proxy[0],proxy[1])})
  •             #print r'http://%s:%s' %(proxy[0],proxy[1])
  •             opener = urllib2.build_opener(cookies,proxyHandler)
  •             opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1')]
  •             #urllib2.install_opener(opener)
  •             t1 = time.time()

  •             try:
  •                 #req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout)
  •                 req = opener.open(self.testUrl, timeout=self.timeout)
  •                 #print "urlopen is ok...."
  •                 result = req.read()
  •                 #print "read html...."
  •                 timeused = time.time() - t1
  •                 pos = result.find(self.testStr)
  •                 #print "pos is %s" %pos

  •                 if (pos > -1):
  •                     checkedProxyList.append((proxy[0],proxy[1],proxy[2],timeused))
  •                     #print "ok ip: %s %s %s %s" %(proxy[0],proxy[1],proxy[2],timeused)
  •                 else:
  •                     continue

  •             except Exception,e:
  •                 print e.message
  •                 continue

  •     def sort(self):
  •         sorted(checkedProxyList,cmp=lambda x,y:cmp(x[3],y[3]))

  •     def run(self):
  •         self.checkProxy()
  •         self.sort()

  • if __name__ == "__main__":
  •     getThreads = []
  •     checkThreads = []

  •     #对每个目标网站开启一个线程负责抓取代理
  •     for i in range(len(targets)):
  •         t = ProxyGet(targets)
  •         getThreads.append(t)

  •     for i in range(len(getThreads)):
  •         getThreads.start()

  •     for i in range(len(getThreads)):
  •         getThreads.join()

  •     print ".......................总共抓取了%s个代理......................." %len(rawProxyList)


  •     #开启20个线程负责校验,将抓取到的代理分成20份,每个线程校验一份
  •     for i in range(20):
  •         t = ProxyCheck(rawProxyList[((len(rawProxyList)+19)/20) * i(len(rawProxyList)+19)/20) * (i+1)])
  •         checkThreads.append(t)

  •     for i in range(len(checkThreads)):
  •         checkThreads.start()


  •     for i in range(len(checkThreads)):
  •         checkThreads.join()


  •     print ".......................总共有%s个代理通过校验......................." %len(checkedProxyList)

  •     #持久化
  •     f= open("D:\\t1.txt",'w+')
  •     for proxy in checkedProxyList:
  •         print "checked proxy is: %s:%s\t%s\t%s\n" %(proxy[0],proxy[1],proxy[2],proxy[3])
  •         f.write("%s:%s\t%s\t%s\n"%(proxy[0],proxy[1],proxy[2],proxy[3]))
  •     f.close()

加载中
返回顶部
顶部