0
回答
python抓取Tripadvisor时,抓取的字段不全,会自动跳过的问题?
# -*- coding: utf-8 -*-


from bs4 import BeautifulSoup
import time
import requests
import re




url = 'http://www.tripadvisor.cn/Restaurants-g298162-Urayasu_Chiba_Prefecture_Kanto.html'


headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}


wb_data1 = requests.get(url)
soup = BeautifulSoup(wb_data1.text,'lxml')


lala = soup.select('div.shortSellDetails > h3 > a ')
laji = []


for ever in lala:
    jieguo = "http://www.tripadvisor.cn"+ever["href"]
    buhao = jieguo.split()
    laji = laji+buhao


for qnmlgb in laji:
    wb_data = requests.get(qnmlgb)
    time.sleep(4)
    soup = BeautifulSoup(wb_data.text,'lxml')


    biaoti = soup.select('#HEADING')
    jutixinxi = soup.select('div > address')
    dianhua = soup.select('div.contact_info > div > div:nth-of-type(1) > div')
    leixing = soup.select('#HEADING_GROUP > div > div.heading_ratings > div:nth-of-type(2) > span > div > a')


    quanbuwenben = soup.get_text().replace('\n','')
    w1 = "CurrentCenter"
    w2 = "signature"
    d = re.compile(w1+'(.*?)'+w2,re.S)
    result = d.findall(quanbuwenben)


    for j,i,k,h,n in zip(biaoti,jutixinxi,dianhua,result,leixing):
        data = {
            '名称':j.get_text().replace('\n',''),
            '地址':i.get_text().replace('\n',''),
            '电话':k.get_text().replace('\n',''),
            '坐标':h.replace('.png|',''),
            '分类':n.get_text().replace('\n',''),
            }
        print(data)

问题出在抓取dianhua这个字段的时候,发现当抓取某个景点页面的信息时,如果没有电话这个字段,这个链接就整个被跳过了。

是不是应该加个判断?就是不知道该怎么写....


举报
夏末雨
发帖于1年前 0回/132阅
顶部