11
回答
python
华为云4核8G,高性能云服务器,免费试用   

python抓取网站图片时,出现如下错误

    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '//wx3.sinaimg.cn/mw600/95e71c7fgy1fdgj1l9zxuj20j60cs3z5.jpg'

我发现网页图片地址是这种

<a href="//wx4.sinaimg.cn/large/ead7db1fgy1fdpzvrcpzgj20jx3ulk89.jpg" target="_blank" class="view_img_link">[查看原图]</a>

是不是错误和http://有关

举报
DataPrince
发帖于1年前 11回/635阅
共有11个答案 最后回答: 1年前
你的 url 是 //wx3.sinaimg.cn/mw600/... 这种「协议相对地址」(protocol-relative URL),浏览器会自动沿用当前页面的协议,但 urllib 需要完整的 scheme,所以报 unknown url type。有效的地址应该以 http:// 开头——在以 // 开头的地址前拼接 'http:' 即可。
--- 共有 2 条评论 ---
uukk 回复 @金樽清酒斗十千 : 找到原因了,找图片地址时,匹配img src=和.jpg时,找到是网页的所有的jpg格式图片,结果找到了一个不符合要下载图片的地址,'http://s.jandan.com/static/gg/lanya.jpg',这是一个广告图片,remove就行 1年前 回复
DataPrince但是网站上的图片URL是//开头,这该怎么改 1年前 回复
import urllib.request
import os
import random

def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser User-Agent header is attached so the server is less
    likely to reject the request with HTTP 403 Forbidden.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36')

    # Bug fix: the original passed the bare ``url`` string here, which
    # silently discarded the Request object (and its User-Agent header)
    # built above.
    response = urllib.request.urlopen(req)
    html = response.read()

    return html

def get_page(url):
    """Return the current comment-page number scraped from *url*'s HTML."""
    text = url_open(url).decode('utf-8')
    # +23 skips the 20-char marker plus the next three chars
    # (presumably '="[') up to the digits — TODO confirm against markup.
    start = text.find('current-comment-page') + 23
    stop = text.find(']', start)
    return text[start:stop]

def find_imgs(url):
    """Collect ``.jpg`` image addresses from the page at *url*.

    Protocol-relative addresses ("//host/path") are normalized to
    "http://host/path" so ``urllib`` can open them — this fixes the
    ``ValueError: unknown url type: '//wx3.sinaimg.cn/...'`` error.
    """
    html = url_open(url).decode('utf-8')
    img_addrs = []

    a = html.find('img src=')
    while a != -1:
        # Only look 255 chars ahead so an unrelated later '.jpg'
        # is not glued onto this tag.
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            addr = html[a + 9:b + 4]
            # Prepend the scheme only when it is missing, so absolute
            # URLs (e.g. ad images already starting with http://) are
            # left untouched.
            if addr.startswith('//'):
                addr = 'http:' + addr
            img_addrs.append(addr)
        else:
            b = a + 9
        a = html.find('img src=', b)

    return img_addrs


def save_imgs(folder,img_addrs):
    """Download every address in *img_addrs* into the current directory.

    *folder* is accepted for interface compatibility but unused here;
    ``download_mm`` already chdir()s into it before calling.
    """
    for each in img_addrs:
        # Normalize protocol-relative '//host/...' URLs; leave
        # already-absolute ones alone (no double 'http:http://').
        if each.startswith('//'):
            each = 'http:' + each
        filename = each.split('/')[-1]
        # Fetch first so a failed download does not leave behind an
        # empty file (the original opened the file before the request).
        img = url_open(each)
        with open(filename,'wb') as f:
            f.write(img)


def download_mm(folder='test',pages=2000):
    """Download images from *pages* consecutive comment pages into *folder*.

    Starts from the site's current page number and walks backwards one
    page per iteration.
    """
    # Don't crash when the folder already exists from a previous run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))

    for i in range(pages):
        # Bug fix: the original did ``page_num -= i``, subtracting
        # 0, 1, 2, ... each turn and skipping ever-larger page ranges;
        # step back exactly one page per iteration instead.
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder,img_addrs)

# Script entry point: run the crawler with default folder/page count.
if __name__ == '__main__':
    download_mm()
    

 

import urllib.request
import os
import random


def url_open(url):
    """Fetch *url* with a browser User-Agent and return the body as bytes."""
    print(url)

    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36')
    # Bug fix: pass the Request (carrying the header) instead of the
    # bare url string — otherwise the User-Agent is dropped and the
    # server answers HTTP 403 Forbidden.
    response = urllib.request.urlopen(req)
    html = response.read()

    return html


def get_page(url):
    """Extract and return the current comment-page number from *url*."""
    page_html = url_open(url).decode('utf-8')
    # Offset 23 jumps past the marker and its trailing chars
    # (presumably '="[') to the first digit — TODO confirm.
    begin = page_html.find('current-comment-page') + 23
    end = page_html.find(']', begin)
    return page_html[begin:end]


def find_imgs(url):
    """Scan the page at *url* and return every candidate .jpg address.

    Addresses are returned exactly as they appear in the HTML; the
    caller is responsible for adding a scheme where needed.
    """
    page = url_open(url).decode('utf-8')
    addrs = []

    pos = page.find('img src=')
    while pos != -1:
        end = page.find('.jpg', pos, pos + 255)
        if end == -1:
            # No .jpg within range: skip past this tag marker.
            nxt = pos + 9
        else:
            addrs.append(page[pos + 9:end + 4])
            nxt = end
        pos = page.find('img src=', nxt)

    return addrs


def save_imgs(folder, img_addrs):
    """Download each address in *img_addrs* into the current directory.

    *folder* is kept for interface compatibility; ``download_mm`` has
    already chdir()'d into it.
    """
    for each in img_addrs:
        # Bug fix: the original prepended 'http:' unconditionally, which
        # mangles already-absolute addresses (e.g. the ad image
        # http://s.jandan.com/static/gg/lanya.jpg mentioned in the
        # thread) into 'http:http://...'. Only scheme-less '//host/...'
        # addresses need the prefix.
        if each.startswith('//'):
            each = 'http:' + each
        filename = each.split('/')[-1]
        # Fetch before opening so a failed request leaves no empty file.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)


def download_mm(folder='testt', pages=2000):
    """Download images from *pages* consecutive comment pages into *folder*."""
    # Don't crash when the folder already exists from a previous run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))

    for i in range(pages):
        # Bug fix: ``page_num -= i`` subtracted 0, 1, 2, ... each turn,
        # skipping ever-larger page ranges; step back one page per
        # iteration instead.
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


# Script entry point: run the crawler with default folder/page count.
if __name__ == '__main__':
    download_mm()
--- 共有 1 条评论 ---
DataPrinceI need your help 1年前 回复

可以了

--- 共有 1 条评论 ---
DataPrince不好意思,你测试了吗,加上后,我怎么出现这种错误 raise HTTPError(req.full_url, code, msg, hdrs, fp) urllib.error.HTTPError: HTTP Error 403: Forbidden 1年前 回复
#!/usr/bin/python3
# coding:utf8
import requests
import re
import os
import time
from threading import Thread
class main:
    """Crawl jandan.net/ooxx comment pages and save the linked images.

    NOTE(review): depends on the third-party ``requests`` package,
    which the file imports at the top.
    """
    def __init__(self):
        # Base listing URL; the caller appends 'N#comments' into url1
        # before each getpage() call.
        self.url = 'http://jandan.net/ooxx/page-'
        self.url1 = ''
        # Running count of saved images; also used as the file name.
        self.n = 0
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/56.0.2924.76 Chrome/56.0.2924.76 Safari/537.36',
        }
        # One Session so connections/cookies are reused across requests.
        self.req = requests.Session()

    def getpage(self):
        """Fetch ``self.url1``, extract original-image links, download each."""
        page = self.req.get(self.url1, headers=self.headers)
        pattern = re.compile(
            u'<div class="text">.*?<p><a href="(.*?)" target="_blank" class="view_img_link">.*?</a><br /><img .*?</div>',
            re.S)
        imgurllist = re.findall(pattern, page.text)
        for u in imgurllist:
            try:
                self.n += 1
                # Fetch first, then write inside a context manager so the
                # handle is closed even when an exception fires (the
                # original leaked the handle on any error after open()).
                src = self.req.get('http:' + u, headers=self.headers)
                with open('picture/%d.jpg' % self.n, 'wb') as w:
                    w.write(src.content)
                # Bug fix: the original incremented self.n a second time
                # here, doubling the counter and leaving only odd-numbered
                # file names on disk.
                print('当前正在下载%d张图片' % self.n)
            except requests.exceptions.ConnectionError:
                print('连接错误!!')
            # Bug fix: a second, unreachable ``except ConnectionError``
            # clause was removed.
            except requests.exceptions.InvalidURL:
                print('无效的url!!!')
            except requests.exceptions.MissingSchema:
                print('url出错!!')
            except requests.exceptions.InvalidSchema:
                print('无效的模式!!')


if __name__=='__main__':
    # Ensure the output directory exists before any thread writes to it.
    if not os.path.exists('picture/'):
        os.mkdir('picture')

    m=main()
    T=[]
    # NOTE(review): this starts one Thread per page (2405 total) on a
    # single shared ``main`` instance.  Every thread reads ``m.url1``,
    # which this loop keeps overwriting — the 0.5 s sleep only makes the
    # race unlikely, it does not remove it.  A bounded worker pool that
    # passes the URL explicitly would be safer; confirm before relying
    # on per-page ordering.
    for i in range(2405):
       m.url1=m.url+str(i)+'#comments'
       t=Thread(target=m.getpage)
       t.start()
       time.sleep(0.5)
       T.append(t)


    # Wait for all downloader threads to finish before exiting.
    for tt in T:
        tt.join()

 

--- 共有 1 条评论 ---
mickelfeng2405页,就开2405个线程 1年前 回复

额,你可以参考下我的这个

--- 共有 8 条评论 ---
DataPrince@喝酒不抽烟 回复@喝酒不抽烟 : 状态码是403,我查了下,是服务器已经理解请求,但是拒绝执行它。 1年前 回复
南寻 回复 @金樽清酒斗十千 : 可能吧,,, 1年前 回复
DataPrince@喝酒不抽烟 回复@喝酒不抽烟 : 如果是网站拦截问题的话会出现这种错误吗 403 1年前 回复
DataPrince@喝酒不抽烟 回复@喝酒不抽烟 : 我觉得也是,他们网站服务器把我拦截了应该 1年前 回复
南寻 回复 @金樽清酒斗十千 : 我觉得是网站的问题 1年前 回复
顶部