jsoup 获取数据出错。换了代理ip也不行

耒耒耒耒耒 发布于 2017/08/17 14:22
阅读 322
收藏 0

如题。jsoup获取数据出错。用了代理去访问也不行。
先贴爬取页面的代码

@Slf4j
@Component
public class SpiderUtil {
 
    @Resource
    private DynamicIpUtil dynamicIpUtil;
 
    /**
     * 根据url爬取页面信息
     *
     * @param url url
     * @return 页面信息
     */
    public Document spiderDocument(String url) {
        Document pageDoc = null;
        try {
            Connection con= Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)")
                    .timeout(5000);
            /*.ignoreHttpErrors(true)
            .followRedirects(true)*/
            Connection.Response resp = con.execute();
            if (resp.statusCode() == 200){
                pageDoc = con.get();
            } else {
                log.error("http status error");
                dynamicIpUtil.changeMyIp();
                spiderDocument(url);
            }
            if(pageDoc == null || pageDoc.toString().trim().equals("")) {// 表示ip被拦截或者其他情况
                log.error("ip被拦截 无内容");
                dynamicIpUtil.changeMyIp();
                spiderDocument(url);
            }
 
        } catch (Exception e) {
            log.error("ip被拦截 异常: {}", e);
            dynamicIpUtil.getMyIpInfo();
            dynamicIpUtil.changeMyIp();
            spiderDocument(url);
        }
        if (ipDefensed(url, pageDoc)) {
            // 如果被ip限制了,更换动态ip
            dynamicIpUtil.changeMyIp();
            spiderDocument(url);
        }
        return pageDoc;
    }
 
    /**
     * 判断ip是否被封
     *
     * @param pageDoc 页面信息
     * @return ip
     */
    private boolean ipDefensed(String url, Document pageDoc) {
        boolean ipDefensed = false;
        if (url.contains("anjuke.com")) {
            ipDefensed = AJKIpDefense(pageDoc);
        }
        return ipDefensed;
    }
 
 
    /**
     * 安居客判断ip是否被封
     *
     * @param pageDoc 页面信息
     */
    private boolean AJKIpDefense(Document pageDoc) {
        log.error("ip 被拦截 安居客");
        boolean ajkppDefensed = false;
        String title = pageDoc.title();
        if (title.equals("访问验证-安居客")) {
            ajkppDefensed = true;
        }
        return ajkppDefensed;
    }
}

 

再贴换动态ip的代码

@Slf4j
@Component
public class DynamicIpUtil {
 
    private static List<String[]> ipAndPorts = new ArrayList<String[]>();
 
    private static Integer ipPageNum = 1;
 
    /**
     * 更换动态ip
     */
    public void changeMyIp() {
        String [] ipAndPort = getDynamicIpAndPort();
        String ip = ipAndPort[0];
        String port = ipAndPort[1];
        System.setProperty("http.maxRedirects", "50");
        System.setProperty("https.maxRedirects", "50");
        System.getProperties().setProperty("proxySet", "true");
        System.getProperties().setProperty("http.proxyHost", ip);
        System.getProperties().setProperty("http.proxyPort", port);
        System.getProperties().setProperty("https.proxyHost", ip);
        System.getProperties().setProperty("https.proxyPort", port);
    }
 
    /**
     * 获取ip信息
     */
    public void getMyIpInfo(){
        try {
            Document ipDoc = Jsoup.connect("http://www.ip.cn")
                    .userAgent("Mozilla")
                    .timeout(3000)
                    .get();
            if(ipDoc != null){
                String ipInfo = ipDoc.select(".well").first().text();
                log.info("更换ip 成功: {}", ipInfo);
            }
        } catch (Exception e) {
            log.info("暂不能获取ip 信息");
        }
    }
 
    /**
     * 获取动态ip
     *
     * @return 动态ip
     */
    private String[] getDynamicIpAndPort() {
        String[] ipAndPort = null;
        if (ipAndPorts != null && ipAndPorts.size() > 0) {
            ipAndPort = ipAndPorts.get(0);
            ipAndPorts.remove(0);
        } else {
            try {
                Document pageDoc = Jsoup.connect("http://www.xicidaili.com/wn/" + ipPageNum)
                        .userAgent("Mozilla")
                        .timeout(5000)
                        .get();
                Elements elements = pageDoc.select("tr.odd");
                ipPageNum ++;
                if(ipPageNum > 400){
                    ipPageNum = 1;
                }
                for(Element element : elements){
                    String[] ipPort = new String[2];
                    String ip = element.child(1).text();
                    String port = element.child(2).text();
                    String noName = element.child(4).text();
//                    if(!noName.equals("高匿")){
//                        continue;
//                    }
                    String speedStr = element.child(6).select(".bar").first().attr("title");
                    double speed = Double.valueOf(speedStr.substring(0, speedStr.indexOf("秒")));
                    String timeStr = element.child(7).select(".bar").first().attr("title");
                    double time = Double.valueOf(timeStr.substring(0, timeStr.indexOf("秒")));
                    if(speed <= 1 && time <= 1){
                        ipPort[0] = ip;
                        ipPort[1] = port;
                        ipAndPorts.add(ipPort);
                    }
                }
                return getDynamicIpAndPort();
            } catch (IOException e) {
                log.error("get DynamicIpError error info :\n {}", e);
            }
        }
        return ipAndPort;
    }
}

 

 

如上。在获取这个网页上的数据的时候会出现问题 https://cd.zu.anjuke.com/fangyuan/p1/

具体的错误有几种。
java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(SocketInputStream.java:150)
at java.net.SocketInputStream.read(SocketInputStream.java:121)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:703)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:647)
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2000)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)



org.jsoup.HttpStatusException: HTTP error fetching URL
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:590)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:587)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)


java.io.IOException: Unable to tunnel through proxy. Proxy returns "HTTP/1.1 503 Too many open connections"
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2084)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)


java.net.SocketException: Unexpected end of file from server
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:790)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:647)
at sun.net.www.protocol.http.HttpURLConnection.doTunneling(HttpURLConnection.java:2000)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:183)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:153)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:563)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:587)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:540)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:227)


请各位高手帮我看下。谢谢了。在线等。急急急急急急!!!!!

加载中
0
徐大大啊
徐大大啊

记得以前用安居客找房子,发现他们很多数据信息都是爬其他网站的,你们爬安居客的数据,哈哈哈,,

你把host给设置上试试呢

0
徐大大啊
徐大大啊

引用来自“徐大大啊”的评论

记得以前用安居客找房子,发现他们很多数据信息都是爬其他网站的,你们爬安居客的数据,哈哈哈,,

你把host给设置上试试呢

最好把请求头的信息都给模拟上,这样就没问题了

耒耒耒耒耒
耒耒耒耒耒
好的呢。谢谢
0
依然菜刀
依然菜刀

可以试试 Java 提供的另一种代理方式:java.net.Proxy

返回顶部
顶部