当前访客身份:游客 [ 登录 | 加入 OSCHINA ]

代码分享

当前位置:
代码分享 » Python  » 网络编程
pengyouya123

扫描代理

pengyouya123 发布于 2014年06月13日 19时, 1评/965阅
分享到: 
收藏 +0
2
注:不适用于windows。linux,osx均可,用于批量获取与验证代理,有些函数在外部定义没有给出,自行替换
标签: Python

代码片段(1) [全屏查看所有代码]

1. [代码]grab proxies     跳至 [1] [全屏预览]

# -*- coding: utf-8 -*-

# Desc: Grab proxy ip
# Date: 2014/06/13

import os
import urllib
from util import *
from bs4 import BeautifulSoup

# Shared logger obtained from util (star import above); its exact
# behavior is defined there — TODO confirm it is a logging-like object.
log = getUniqueLog()
# Region names appended to baseUrl to build per-region listing URLs.
# NOTE(review): 'Twaiwan' and 'Bahrein' look misspelled, but they may
# match proxy360.cn's own URL spellings — verify before changing.
regions = ['China', 'America', 'Brazil', 'Japan', 'Twaiwan', 'Thailand', 'Bahrein']
baseUrl = 'http://www.proxy360.cn/Region/'


# if bypass in limit time return True
def ping(ip):
	# Returns True when a single ping to `ip` succeeds (exit status 0).
	# NOTE(review): flags are platform-specific — on macOS `-W` is in
	# milliseconds and `-Q` means quieter output, but on Linux `-W` is in
	# seconds and `-Q` expects a TOS argument; confirm the target OS
	# (header says linux/osx) before relying on the 2s timeout.
	cmd = "ping -Q -c 1 -W 2000 %s 1>/dev/null 2>&1"  % ip
	# os.system returns the shell exit status; 0 means the host replied.
	return os.system(cmd) == 0

def filterIp(fname):
	# Read a file whose lines start with "<ip> ...", ping each IP, and
	# keep only the reachable ones. Prints the list (as before) and now
	# also returns it so callers can use the result programmatically.
	result = []
	with open(fname) as fp:
		for line in fp:
			line = line.strip()
			# Iterating a file never yields None, so only blank lines
			# need skipping (the original's `line is None` was dead code).
			if line == '':
				continue

			# First whitespace-separated field is the IP address.
			ip = line.split(' ')[0]
			if ping(ip):
				result.append(ip)

	print(result)
	return result

class Proxy():
	"""A single proxy-server record scraped from proxy360.

	Fields: address (host/IP string), port, hideprop (anonymity level),
	country, pubdate (publication date string).
	"""

	def __init__(self, address, port, hideprop, country, pubdate ):
		self.address = address
		self.port = port
		self.country = country
		self.pubdate = pubdate
		self.hideprop = hideprop

	def valid(self):
		# A proxy is considered usable when its host answers a ping.
		return ping(self.address)

	def convertDict(self):
		# Serialize to a plain dict (e.g. for JSON storage via saveJson).
		return {'address':self.address, 'port':self.port, 'country':self.country, 'pubdate':self.pubdate, 'hideprop':self.hideprop}

	@staticmethod
	def initWithDict(proxyDict):
		# Inverse of convertDict. Fixed: the original passed country and
		# hideprop in swapped positions relative to __init__'s signature
		# (and lacked @staticmethod), so round-tripping a record through
		# convertDict/initWithDict silently corrupted those two fields.
		return Proxy(proxyDict['address'], proxyDict['port'], proxyDict['hideprop'], proxyDict['country'], proxyDict['pubdate'])

	def __str__(self):
		return '%s:%s %s %s %s' % (self.address, self.port, self.country, self.hideprop, self.pubdate)

# proxy360
def fetchProxies(region='China'):
	"""Scrape proxy360.cn's listing for `region`, ping-check each proxy,
	and save the reachable ones to proxy_<region>.json."""
	print('[Region: %s]' % region)
	url = baseUrl + urllib.quote(region)
	print('Fetching page ...')
	page = fetchPage(url)  # from util; returns None on failure per the check below
	if page is None:
		print('Fetch page failed')
		return

	print('Analysising html ...')
	soup = BeautifulSoup(page)
	if soup is None:
		print('parse html failed')
		return

	# Bind before the try: the original created validProxies inside the
	# try block, so an early exception (e.g. in find_all) triggered a
	# NameError inside the finally clause.
	validProxies = []
	try:
		nodes = soup.find_all('div', class_ ='proxylistitem')
		total = len(nodes)
		cnt = 0
		validCnt = 0

		for node in nodes:
			cnt += 1
			print('Dealing with (%d/%d:%d)th item ...' % (cnt, total, validCnt))
			# Expected span order on proxy360: address, port, anonymity,
			# country, pubdate — TODO confirm against the live page.
			proxyItems = node.find_all('span', class_ = 'tbBottomLine')
			proxy = Proxy(proxyItems[0].text.strip(), proxyItems[1].text.strip(), proxyItems[2].text.strip(), proxyItems[3].text.strip(), proxyItems[4].text.strip())
			print(proxy)
			if (proxy.valid()):
				validProxies.append(proxy.convertDict())
				validCnt += 1

	except Exception as e:
		print(e)
	finally:
		# Save whatever was validated, even if scraping died part-way.
		print('Save proxies ...')
		if validProxies:
			saveJson(validProxies, 'proxy_' + region.lower() + '.json')

	print('Congratulation!')
# end fetch proxy

# proxy360.json
def testProxies(fname):
	"""Load proxies from a JSON file (a list of dicts with at least
	'address' and 'port' keys, as written by fetchProxies) and print the
	addresses through which an HTTP fetch succeeds.

	NOTE(review): relies on `json` and `urllib2` being in scope via
	`from util import *` — neither is imported in this file directly;
	verify util actually re-exports them.
	"""
	with open(fname) as fp:
		data = json.load(fp)
		result = []
		for item in data:
			# Deliberately best-effort: any failure (missing key, DNS
			# error, timeout, refused connection) just skips this proxy.
			# Dropped the unused `as e` binding from the original.
			try:
				ip = item['address']
				port = item['port']
				print(ip)
				socket = '%s:%s' % (ip, port)
				proxy_handler = urllib2.ProxyHandler({'http':socket})
				proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
				opener = urllib2.build_opener(proxy_handler, proxy_auth_handler)
				# ip138 echoes the caller's apparent IP, proving the proxy relays.
				print(opener.open('http://20140507.ip138.com/ic.asp', timeout=3).read())
				result.append(ip)
			except Exception:
				pass

		print(result)

# bypass
def checkProxy(ip, port):
	"""Return True if an HTTP request through ip:port succeeds within 4s.

	NOTE(review): relies on `urllib2` being in scope via
	`from util import *`; it is not imported in this file directly.
	"""
	try:
		print('%s %s' % (ip, port))
		socket = '%s:%s' % (ip, port)
		proxy_handler = urllib2.ProxyHandler({'http':socket})
		proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
		opener = urllib2.build_opener(proxy_handler, proxy_auth_handler)
		# Fetching a real page through the proxy is the success criterion.
		print(opener.open('http://www.twitter.com', timeout=4).read())
		return True
	except Exception:
		# Any failure (DNS, timeout, refused) marks the proxy as bad.
		# The original also had an unreachable trailing `return True`
		# after the try/except — removed as dead code.
		return False

# load from file: ip:port
def loadProxies(fname):
	"""Read "ip:port" lines from fname, check each proxy with checkProxy,
	and print the working ones as a list of {ip: port} dicts."""
	with open(fname) as fp:
		result = []
		for line in fp:
			line = line.strip()
			if line == '':
				continue

			segments = line.split(':')
			# Robustness: skip malformed lines with no ':' separator
			# (the original raised IndexError on them).
			if len(segments) < 2:
				continue
			ip = segments[0]
			port = segments[1]
			if checkProxy(ip, port):
				result.append({ip:port})
		print(result)

if __name__ == '__main__':
	# Timer and setupOpener come from util (star import above); presumably
	# Timer starts timing on construction and setupOpener installs a
	# urllib opener — TODO confirm against util.
	timer = Timer()
	setupOpener()
	fetchProxies('America')
	#loadProxies('proxy2.txt')
	timer.stop()





开源中国-程序员在线工具:Git代码托管 API文档大全(120+) JS在线编辑演示 二维码 更多»

发表评论 回到顶部 网友评论(1)

  • 1楼:yuris_115 发表于 2014-08-01 21:45 回复此评论
    为什么不用socket呢?
开源从代码分享开始 分享代码
pengyouya123的其它代码 全部(6)...