python正则表达式的匹配问题

隔壁的流氓王欧巴 发布于 2016/01/07 15:50
阅读 268
收藏 0
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re








# Sample fragment of the price table: one <em> cell per line, with the cells
# separated by a newline plus indentation.
line="""tyle="width:100px;text-align:left;padding-left:5px;">亮玉大米</em>
                                        <em style="width:80px;">2.88</em>
                                        <em style="width:80px;">2.90</em>
                                        <em style="width:80px;">2.92</em>
                                        <em style="width:80px;">普通</em>
                                        <em style="width:80px;">斤</em>
                                        <em style="width:106px;">2016-01-01</em>
                                        <em style="width:60px;">&nbsp;</em>"""
# The cell text is followed directly by </em>; the previous pattern required
# a space before </em> and therefore never matched anything.
pattern = re.compile(r'<em style="width:80px;">(.*?)</em>', re.S)
# With exactly one capture group, findall returns a flat list of strings.
items = pattern.findall(line)
# Print every captured cell on one line ("2.88 2.90 2.92 ...").  Indexing
# item[0], item[1], ... would pick single characters out of each string.
# The single-argument print(...) form behaves the same on Python 2 and 3.
print(" ".join(items))


#matchObj = re.search( r"""<em style="width:106px;">(.*)</em>""", line, re.M)




#if matchObj:


 #  print "matchObj.group(1) : ", abc.group()
  # print "matchObj.group(1) : ", matchObj.group(1)
#else:
 #  print "No match!!"

~        




现在的代码如上。请各位看被匹配的那段文本:其中有好几行都是 <em style="width:80px;">…</em> 这种格式,而且每一行之后都隔着一段空白(换行加空格)才是下一行。我想把所有这种格式的内容都匹配出来,最终的结果就是依次输出 2.88 2.90 2.92 ……



还有一个问题:我拿来练手的这个网页是新发地的,里面有很多同样格式的条目,我希望正则匹配完之后得到的是

大白菜 2.88 2.90 2.91 

葵花油 2.88 5.8 5.5

这样可以直接导入 SQL 的格式。网页的部分代码如下:

                                              

<div class="conDiv" id="tab1_div_0">
<ul>
<li class="conLi">
<em style="width:100px;text-align:left;padding-left:5px;">大白菜</em>
<em style="width:80px;">0.25</em>
<em style="width:80px;">0.32</em>
<em style="width:80px;">0.38</em>
<em style="width:80px;">普通</em>
<em style="width:80px;"></em>
<em style="width:106px;">2016-01-07</em>
<em style="width:60px;">&nbsp;</em>
</li><li class="conLi conLiji">
<em style="width:100px;text-align:left;padding-left:5px;">甘蓝洋白菜</em>
<em style="width:80px;">0.70</em>
<em style="width:80px;">1.02</em>
<em style="width:80px;">1.35</em>
<em style="width:80px;">普通</em>
<em style="width:80px;"></em>
<em style="width:106px;">2016-01-07</em>

<em style="width:60px;">&nbsp;</em>



求各位大神

加载中
0
chengym
chengym
最好有网址..
隔壁的流氓王欧巴
隔壁的流氓王欧巴
www.xinfadi.com.cn
0
chengym
chengym
#!/usr/bin/env python3
# coding=utf-8




'''
'''
import urllib.request as request
import re
import json




def getText(url):
    """Fetch *url* and return the response body decoded to ``str``.

    The request is sent with the module-level ``headers`` dict.  The body is
    decoded with each candidate codec in turn (UTF-8 first, then GBK) and the
    first clean result is returned.  If no codec decodes the bytes cleanly,
    the body is decoded as UTF-8 with undecodable bytes replaced, so the
    caller always receives a string rather than ``None``.

    Errors raised while opening or reading the URL propagate to the caller.
    """
    req = request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises.
    with request.urlopen(req) as html:
        btext = html.read()
    codes = ['utf-8', 'gbk']
    for codec in codes:
        try:
            # Use the loop's codec: decoding with a hard-coded 'utf-8' meant
            # the GBK fallback was never actually attempted.
            return btext.decode(codec)
        except UnicodeDecodeError:
            continue
    # Best effort: keep whatever is decodable instead of returning None.
    return btext.decode(codes[0], errors='replace')




def processPattern(text):
    """Yield one ``(name, price, price, price)`` tuple per product in *text*.

    Product names are collected with the module-level ``name_pattern`` and
    numeric cells with ``price_pattern``.  The numeric cells are consumed
    three at a time and paired positionally with the names; pairing stops
    as soon as either the names or the complete price triples run out.
    """
    found_names = re.findall(name_pattern, text)
    price_stream = iter(re.findall(price_pattern, text))
    # Zipping one iterator with itself three times yields consecutive
    # triples and silently drops an incomplete trailing group.
    triples = zip(price_stream, price_stream, price_stream)
    for product, triple in zip(found_names, triples):
        yield (product,) + triple




def toJson(obj, filename):
    """Serialize *obj* as JSON into the file ``<filename>.json``.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are stored verbatim rather than as ``\\uXXXX`` escapes.
    """
    target = filename + '.json'
    with open(target, mode='w', encoding='utf-8') as sink:
        json.dump(
            obj,
            sink,
            indent=2,
            ensure_ascii=False,
        )




def main(url):
    """Download *url*, extract the product rows and save them as ``Food.json``."""
    page = getText(url)
    rows = processPattern(page)
    # Materialize the generator so the rows can be serialized as a JSON array.
    toJson(list(rows), 'Food')
# Settings read as globals by getText() and processPattern().  They live at
# module level so the functions also work when this file is imported, not
# only when it is executed as a script.
# Captures the text of the product-name cell.
name_pattern = r'<em style="width:100px;text-align:left;padding-left:5px;">(.+?)</em>'
# Captures numeric cells only; non-numeric 80px cells are skipped.
price_pattern = r'<em style="width:80px;">(\d+?\.?\d*?)</em>'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2537.0 Safari/537.36'}

if __name__ == '__main__':
    url = 'http://www.xinfadi.com.cn/'
    main(url)

0
chengym
chengym
#!/usr/bin/env python3
# coding=utf-8




'''
'''
import urllib.request as request
import re
import sqlite3




def getText(url):
    """Fetch *url* and return the response body decoded to ``str``.

    The request is sent with the module-level ``headers`` dict.  Candidate
    codecs are tried in order (UTF-8, then GBK) and the first one that
    decodes the body without error wins.  When none does, the body is
    decoded as UTF-8 with undecodable bytes replaced, so a string is always
    returned instead of ``None``.

    Errors raised while opening or reading the URL propagate to the caller.
    """
    req = request.Request(url, headers=headers)
    # Close the response deterministically, even if read() fails.
    with request.urlopen(req) as html:
        btext = html.read()
    codes = ['utf-8', 'gbk']
    for codec in codes:
        try:
            # Decode with the codec under test; a hard-coded 'utf-8' here
            # made the second candidate unreachable.
            return btext.decode(codec)
        except UnicodeDecodeError:
            continue
    # Best effort: keep whatever is decodable instead of returning None.
    return btext.decode(codes[0], errors='replace')




def processPattern(text):
    """Yield one ``(name, price, price, price, date)`` tuple per product.

    Names, numeric cells and dates are collected from *text* with the
    module-level ``name_pattern``, ``price_pattern`` and ``data_pattern``.
    Numeric cells are consumed three at a time; names, price triples and
    dates are then paired positionally, stopping at the shortest sequence.
    """
    found_names = re.findall(name_pattern, text)
    price_stream = iter(re.findall(price_pattern, text))
    # Zipping one iterator with itself three times yields consecutive
    # triples and silently drops an incomplete trailing group.
    triples = zip(price_stream, price_stream, price_stream)
    found_dates = re.findall(data_pattern, text)
    for product, triple, day in zip(found_names, triples, found_dates):
        yield (product,) + triple + (day,)




def initSql(dbname):
    """Open the SQLite database file ``<dbname>.dat``.

    Returns a ``(cursor, connection)`` pair for that database.
    """
    connection = sqlite3.connect(dbname + '.dat')
    cursor = connection.cursor()
    return cursor, connection




def toSqlite(objs, tb_name):
    """Insert the rows in *objs* into table *tb_name*, then commit and close.

    Works on the module-level cursor ``c`` and connection ``conn``.  The
    table is created on first use; on later runs the existing table is
    reused and the new rows are appended, instead of failing with
    "table ... already exists".

    objs: iterable of 5-item rows (name, three prices, publish date).
    tb_name: table name.  It is interpolated into the SQL text, so it must
        be a trusted identifier, never user input.
    """
    try:
        c.execute('''CREATE TABLE IF NOT EXISTS %s(品名 text, 最低价 text, 平均价 text, 最高价 text, 发布日期 text)'''%(tb_name))
        print(objs)
        c.executemany('INSERT INTO %s VALUES(?, ?, ?, ?, ?)'%(tb_name), objs)
        conn.commit()
    finally:
        # Release the database file even when a statement fails.
        conn.close()




def main(url):
    """Download *url*, extract the product rows and store them in table ``food``."""
    page = getText(url)
    rows = processPattern(page)
    # Materialize the generator before handing the rows to the database layer.
    toSqlite(list(rows), 'food')
# Settings read as globals by getText() and processPattern().  They live at
# module level so the functions also work when this file is imported, not
# only when it is executed as a script.
# Captures the text of the product-name cell.
name_pattern = r'<em style="width:100px;text-align:left;padding-left:5px;">(.+?)</em>'
# Captures numeric cells only; non-numeric 80px cells are skipped.
price_pattern = r'<em style="width:80px;">(\d+?\.?\d*?)</em>'
# Captures the YYYY-MM-DD text of the date cell.
data_pattern = r'<em style="width:106px;">(\d{4}-\d{2}-\d{2})</em>'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2537.0 Safari/537.36'}

if __name__ == '__main__':
    url = 'http://www.xinfadi.com.cn/'
    # The database is opened only when run as a script, so importing this
    # module does not create a file.  toSqlite() uses these two globals.
    c, conn = initSql('foods')
    main(url)

隔壁的流氓王欧巴
隔壁的流氓王欧巴
吊炸天。。。
0
neuront
neuront

不要用正则表达式解析 HTML / XML

正则表达式不是用来解析 HTML / XML 的工具

建议楼主去看看这篇回答 http://stackoverflow.com/a/1732454/555515


正确的做法是, 使用 BeautifulSoup http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/ 或者其他什么 XML 解析工具.

neuront
neuront
回复 @隔壁的流氓王欧巴 : 刚才打了一行手滑了就出去了, 已加一个库的链接
隔壁的流氓王欧巴
隔壁的流氓王欧巴
求个正确方法。。。来个教程也是极好的。。。
返回顶部
顶部