Python 抓取中国天气网天气数据

众所周知，中国天气网提供了一个公共的天气预报 API 接口，但不知为何，数据一直停在 3 月 4 号不再更新。最近在做一个天气方面的 APP，需要用到一些天气数据，而 360 的接口公司不让用，只好自己写一个 Python 脚本放在数据中心。

先列出三个目前还能用的接口；据说万年历也有未来 7 天预报的 API。

http://weather.com.cn/data/zs/101280601.html 各种各样的指数

http://weather.com.cn/data/cityinfo/101280601.html 简洁天气信息

http://weather.com.cn/data/sk/101280601.html 实况天气

获取未来15天温度

#coding=utf-8

import urllib,re,MySQLdb,ConfigParser,datetime,time

# Read the clock once and derive year/month/day from that single reading, so
# the three parts can never disagree when the script runs across midnight.
now = time.strftime("%Y-%m-%d")
y, m, d = now.split('-')

# Placeholder temperatures; the scrape further down overwrites them.
todaylow = 99
todayhigh = 99

print(now)

def getHtml(url):
    """Fetch *url* and return the raw response body.

    The response object is closed even when reading fails, so the
    underlying connection is not leaked.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

def getRel(html,reg):
    """Return every non-overlapping match of pattern *reg* found in *html*.

    The result has the shape produced by ``findall``: a list of strings for
    patterns with zero or one group, a list of tuples for several groups.
    """
    return re.compile(reg).findall(html)

html = getHtml("http://weather.com.cn/weather/101280601.shtml")

# The patterns previously contained a bare "s" where a whitespace escape was
# intended (a literal "s" can never match the whitespace between tags), so
# "\s" is restored here.
# NOTE(review): the empty tag attributes ("<p >", "<section >") look truncated
# as well - presumably class names were lost; verify against the live markup.
reg1 = r'<p >\s<span>(.+)</span><i>°C</i>'  # temperature
reg2 = r'</span>\s</em>\s<i>(.+)</i>'  # wind
reg3 = r'<p >(.+)</p>\s<p >'  # index
reg4 = r'<section >\s.+\s.+\s.+7d1"><b>(.+)</b>'  # feeling
reg5 = r'<section >\s.+\s.+\s.+7d1"><b>.+</b>(.+)</aside>'  # wearing suggestion

temputer = getRel(html, reg1)

# The first two matches are taken as today's high and low. When fewer than
# two values were scraped the previously assigned values are left untouched
# instead of raising IndexError.
if len(temputer) >= 2:
    todayhigh = temputer[0]
    todaylow = temputer[1]

# Collapse each consecutive pair into a single "first℃~second℃" string.
# A trailing unpaired value is kept as-is rather than raising IndexError.
# NOTE(review): with an odd count the pairing may be shifted by one (the page
# may have dropped today's high) - confirm against the page before relying
# on the per-day alignment.
paired = []
for i in range(0, len(temputer) - 1, 2):
    paired.append(temputer[i] + "℃~" + temputer[i + 1] + "℃")
if len(temputer) % 2:
    paired.append(temputer[-1])
temputer = paired

wind = getRel(html, reg2)
index = getRel(html, reg3)
feeling = getRel(html, reg4)
suggest = getRel(html, reg5)

# Order matters: values are matched positionally against the key list when
# the JSON file is written.
mlist = temputer + index + wind + feeling + suggest

print(mlist)

# JSON keys, in the same order as the scraped values in mlist.
_mlist = ['temp1','temp2','temp3','temp4','temp5','temp6','temp7','weather1','weather2','weather3','weather4','weather5','weather6','weather7','wind1','wind2','wind3','wind4','wind5','wind6','wind7','index','index48_d']

# Chinese weekday names indexed by date.weekday() (Monday == 0). The weekday
# is derived from the run date instead of being fixed to one day.
_WEEKDAYS = ('星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日')
week = _WEEKDAYS[datetime.date(int(y), int(m), int(d)).weekday()]


def _json_escape(text):
    """Escape backslashes and double quotes so *text* fits in a JSON string."""
    return text.replace('\\', '\\\\').replace('"', '\\"')


fields = [
    '"city":"深圳"',
    '"city_en":"shenzhen"',
    '"date_y":"' + y + '年' + m + '月' + d + '日"',
    '"week":"' + week + '"',
]
# zip() stops at the shorter list, so a short scrape yields fewer keys
# instead of an IndexError; surplus scraped values are ignored as before.
fields.extend('"' + key + '":"' + _json_escape(value) + '"'
              for key, value in zip(_mlist, mlist))

# The with-block closes the file even if the write fails.
with open(r'Weather.html', 'w') as f:
    f.write('{"weatherinfo":{' + ','.join(fields) + '}}')



# NOTE(review): database credentials are hard-coded here; consider loading
# them from configuration.
conn = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',passwd='123456',db='weather',charset='utf8')
    cur = conn.cursor()
    # Row: date, today's low, today's high, and '-1' as a placeholder
    # (presumably the pm2_5 column, filled in later - verify against schema).
    val = [now, todaylow, todayhigh, '-1']
    print(val)
    cur.execute('insert into record values(%s,%s,%s,%s)', val)
    conn.commit()
    cur.close()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
else:
    # Only report success when the insert actually went through.
    print('Update and inser Database Success')
finally:
    # Release the connection on both the success and the error path.
    if conn is not None:
        conn.close()
#raw_input()

获取实时空气值（从Pm.in抓取）

#coding=utf-8
import urllib,re,MySQLdb,time,datetime

# Read the clock once and split it, so year/month/day always describe the
# same instant even when the script runs across midnight.
now = time.strftime("%Y-%m-%d")
y, m, d = now.split('-')

# Default PM value used until the scrape provides a real reading.
pm = '-1'
print(now)

def getHtml(url):
    """Download *url* and return the response body as read from the server.

    The response is closed in a ``finally`` clause so the connection is
    released even if ``read()`` raises.
    """
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()
    return html

def getRel(html,reg):
    """Collect all matches of the regular expression *reg* inside *html*."""
    pattern = re.compile(reg)
    matches = pattern.findall(html)
    return matches


html = getHtml("http://www.pm25.in/shenzhen")

# The patterns previously had bare "s"/"d" where "\s"/"\d" were intended, so
# they could not match whitespace or digits. The stray "|" inside the
# character class (a literal pipe, not alternation) is dropped as well.
# NOTE(review): '<div class.*="value">' and '<div >' look like they lost
# attribute text - verify against the live page markup.
reg1 = r'<div class.*="value">\s+([\d.]+)\s+</div>'  # air value
reg2 = r'<div >\s+<h4>\s+(.+)\s+</h4>'  # index
reg3 = r'<p>建议采取的措施：\s+(.+)\s+</p>'  # suggested measures


val = getRel(html, reg1)
index = getRel(html, reg2)
suggest = getRel(html, reg3)

mlist = val + index + suggest
# The second scraped value is what gets stored as the PM reading. With fewer
# than two matches the previously assigned default is kept instead of
# raising IndexError.
if len(mlist) > 1:
    pm = str(mlist[1])
print(mlist)

# JSON keys, positionally matched against mlist when the file is written.
_mlist = ['aqi','pm2_5_24h','PM10/1h','CO/1h','NO2/1h','O3/1h','O3/8h','SO2/1h','quality','suggest']
print(_mlist)

# All but the last two scraped values are written as bare (numeric) JSON
# values; the last two are written as quoted strings under the last two keys.
# Slicing from the end keeps the two text keys aligned with the two text
# values even when fewer numeric readings were scraped, and zip() prevents an
# IndexError on a length mismatch. The builtin len() is no longer shadowed.
numeric_pairs = zip(_mlist[:-2], mlist[:-2])
text_pairs = zip(_mlist[-2:], mlist[-2:])

fields = ['"' + key + '":' + value for key, value in numeric_pairs]
fields += ['"' + key + '":"' + value + '"' for key, value in text_pairs]

# The with-block closes the file even if the write fails.
with open(r'AirCondition.html', 'w') as f:
    f.write('[{' + ','.join(fields) + '}]')

print("%s %s" % (now, pm))
# NOTE(review): database credentials are hard-coded here; consider loading
# them from configuration.
conn = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',passwd='123456',db='weather',charset='utf8')
    cur = conn.cursor()
    # pm comes from scraped HTML, so it is passed as a bound parameter and
    # never interpolated into the SQL text.
    sql = "UPDATE record SET pm2_5 = %s WHERE date = %s"
    params = (str(pm), str(now))
    print("%s %r" % (sql, params))
    cur.execute(sql, params)
    conn.commit()
    cur.close()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
else:
    # Only report success when the update actually went through.
    print('Air update and inser Database Success')
finally:
    # Release the connection on both the success and the error path.
    if conn is not None:
        conn.close()
#raw_input()

然后写个线程，每隔半个小时更新一次，最后用 SimpleHTTPServer 运行就能访问了。

# -*- coding: utf-8 -*-  
import SimpleHTTPServer
import SocketServer

# TCP port the file server listens on.
PORT = 80

# Stock request handler from the standard library.
Handler = SimpleHTTPServer.SimpleHTTPRequestHandler

# An empty host string binds to all local interfaces.
listen_address = ("", PORT)
httpd = SocketServer.TCPServer(listen_address, Handler)

print("serving at port %s" % PORT)
# Blocks here, handling requests until the process is stopped.
httpd.serve_forever()

PS：运行两天后发现中国天气网有个大坑（见下图）：晚上抓取的数据里没有白天的温度，也就是缺少最高温度。

果断转移阵地,从2345拿数据。

#coding=utf-8

import urllib,re,ConfigParser,datetime,time


def getHtml(url):
    """Return the body of the HTTP response for *url*.

    The response handle is always closed, including when ``read()`` fails,
    so the connection is not left open.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

def getRel(html,reg):
    """Find all occurrences of the pattern *reg* in *html* and return them."""
    return re.findall(reg, html)

html = getHtml("http://tianqi.2345.com/shenzhen/59493.htm")

# The patterns previously had bare "w"/"s" where "\w"/"\s" were intended
# (e.g. "-?w+" only matched runs of the letter "w"); the escapes are restored.
# NOTE(review): "<font >", "<p >" and "<section >" look like they lost
# attribute text - verify against the live page markup.
reg1 = r'<font >(-?\w+).{0,2}</font>'  # temperature
# reg2-reg5 are not used by the code visible below.
reg2 = r'</span>\s</em>\s<i>(.+)</i>'  # wind
reg3 = r'<p >(.+)</p>\s<p >'  # index
reg4 = r'<section >\s.+\s.+\s.+7d1"><b>(.+)</b>'  # feeling
reg5 = r'<section >\s.+\s.+\s.+7d1"><b>.+</b>(.+)</aside>'  # wearing suggestion

temputer = getRel(html, reg1)

print(temputer)