Python 抓取中国天气网天气数据
众所周知,中国天气网提供了一个公共的天气预报 API 接口,但不知为何数据一直停在 3 月 4 号不再更新。最近在做一个天气方面的 APP,需要用到一些天气数据,而 360 的接口公司不让用,只好自己写一个 Python 脚本放在数据中心。
先发三个目前尚能使用的接口;另外,据说万年历有未来 7 天预报的 API。
http://weather.com.cn/data/zs/101280601.html 各种各样的指数
http://weather.com.cn/data/cityinfo/101280601.html 简洁天气信息
http://weather.com.cn/data/sk/101280601.html 实况天气
获取未来15天温度
#coding=utf-8
import urllib,re,MySQLdb,ConfigParser,datetime,time
# Read the clock once so the year/month/day parts cannot disagree when the
# script happens to run across midnight.
_today = time.localtime()
y = time.strftime("%Y", _today)
m = time.strftime("%m", _today)
d = time.strftime("%d", _today)
now = y + '-' + m + '-' + d  # YYYY-MM-DD, used as the date value of the DB row
# Placeholders; overwritten once temperatures have been scraped.
todaylow = 99
todayhigh = 99
print(now)
def getHtml(url):
    """Download url and return the raw response body.

    The response handle is closed before returning, including when the
    read itself raises.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
def getRel(html,reg):
    """Return every non-overlapping match of the pattern reg in html.

    The result has the shape re.findall produces: a list of strings for a
    pattern with at most one group, a list of tuples for several groups,
    and an empty list when nothing matches.
    """
    return re.compile(reg).findall(html)
html = getHtml("http://weather.com.cn/weather/101280601.shtml")

# NOTE(review): these patterns look like they lost characters when the script
# was published (a bare "s" where "\s" would be expected, tags with empty
# attribute lists). They are kept verbatim; verify them against the live page.
reg1 = r'<p >s<span>(.+)</span><i>°C</i>'  # temperature
reg2 = r'</span>s</em>s<i>(.+)</i>'  # wind
reg3 = r'<p >(.+)</p>s<p >'  # weather text (stored under the weatherN keys)
reg4 = r'<section >s.+s.+s.+7d1"><b>(.+)</b>'  # feeling
reg5 = r'<section >s.+s.+s.+7d1"><b>.+</b>(.+)</aside>'  # clothing suggestion

# Readings come as high, low, high, low, ...  An odd count presumably means
# today's daytime high is absent (the author saw this on pages fetched at
# night), so the first reading is then kept as a lone low and todayhigh keeps
# its placeholder instead of the pairing running off the end of the list.
readings = getRel(html, reg1)
temputer = []
if len(readings) % 2:
    todaylow = readings[0]
    temputer.append(readings[0] + "℃")
    readings = readings[1:]
elif readings:
    todayhigh = readings[0]
    todaylow = readings[1]
for pos in range(0, len(readings), 2):
    temputer.append(readings[pos] + "℃~" + readings[pos + 1] + "℃")

wind = getRel(html, reg2)
index = getRel(html, reg3)
feeling = getRel(html, reg4)
suggest = getRel(html, reg5)
mlist = temputer + index + wind + feeling + suggest
print(mlist)

_mlist = ['temp1','temp2','temp3','temp4','temp5','temp6','temp7','weather1','weather2','weather3','weather4','weather5','weather6','weather7','wind1','wind2','wind3','wind4','wind5','wind6','wind7','index','index48_d']

# Fail before touching Weather.html so a bad scrape does not replace the last
# good document with a truncated one.
if len(mlist) < len(_mlist):
    raise IndexError('expected at least %d scraped values, got %d' % (len(_mlist), len(mlist)))

# Derive the weekday from the same date that is written to date_y.
_WEEKDAYS = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日']
week = _WEEKDAYS[datetime.date(int(y), int(m), int(d)).weekday()]

fields = ['"' + name + '":"' + value + '"' for name, value in zip(_mlist, mlist)]
with open(r'Weather.html', 'w') as f:
    f.write('{"weatherinfo":{"city":"深圳","city_en":"shenzhen","date_y":"'+y+'年'+m+'月'+d+'日","week":"'+week+'",')
    f.write(','.join(fields) + '}}')

# Store today's low/high; the fourth column is filled with '-1' here.
conn = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',passwd='123456',db='weather',charset='utf8')
    cur = conn.cursor()
    val = [now, todaylow, todayhigh, '-1']
    print(val)
    cur.execute('insert into record values(%s,%s,%s,%s)', val)
    conn.commit()
    cur.close()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
else:
    # Only report success when the insert was actually committed.
    print('Update and inser Database Success')
finally:
    if conn is not None:
        conn.close()
#raw_input()
获取实时空气质量数据(从 pm25.in 抓取)
#coding=utf-8
import urllib,re,MySQLdb,time,datetime
# Read the clock once so the year/month/day parts cannot disagree when the
# script happens to run across midnight.
_today = time.localtime()
y = time.strftime("%Y", _today)
m = time.strftime("%m", _today)
d = time.strftime("%d", _today)
now = y + '-' + m + '-' + d  # YYYY-MM-DD, matched against record.date below
pm = '-1'  # placeholder until a value has been scraped
print(now)
def getHtml(url):
    """Download url and return the raw response body.

    The response handle is closed before returning, including when the
    read itself raises.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
def getRel(html,reg):
    """Return every non-overlapping match of the pattern reg in html.

    The result has the shape re.findall produces: a list of strings for a
    pattern with at most one group, a list of tuples for several groups,
    and an empty list when nothing matches.
    """
    return re.compile(reg).findall(html)
html = getHtml("http://www.pm25.in/shenzhen")

# NOTE(review): these patterns look like they lost characters when the script
# was published (bare "s"/"d" where "\s"/"\d" would be expected, tags with
# empty attribute lists). They are kept verbatim; verify against the live page.
reg1 = r'<div class.*="value">s+([d|.]+)s+</div>'  # air values
reg2 = r'<div >s+<h4>s+(.+)s+</h4>'  # quality index text
reg3 = r'<p>建议采取的措施:s+(.+)s+</p>'  # suggested measures

val = getRel(html, reg1)
index = getRel(html, reg2)
suggest = getRel(html, reg3)
mlist = val + index + suggest
print(mlist)
_mlist = ['aqi','pm2_5_24h','PM10/1h','CO/1h','NO2/1h','O3/1h','O3/8h','SO2/1h','quality','suggest']
print(_mlist)

# Fail before touching AirCondition.html: fewer than two values leaves no
# PM2.5 reading, more than len(_mlist) would index past the key names.
count = len(mlist)
if count < 2 or count > len(_mlist):
    raise IndexError('expected between 2 and %d scraped values, got %d' % (len(_mlist), count))
pm = str(mlist[1])

with open(r'AirCondition.html', 'w') as f:
    f.write('[{')
    # All but the last two values are written unquoted; the last two are quoted.
    for name, value in zip(_mlist[:count - 2], mlist[:count - 2]):
        f.write('"' + name + '":' + value + ',')
    f.write('"' + _mlist[count - 2] + '":"' + mlist[count - 2] + '",')
    f.write('"' + _mlist[count - 1] + '":"' + mlist[count - 1] + '"')
    f.write('}]')
print("%s %s" % (now, pm))

conn = None
try:
    conn = MySQLdb.connect(host='localhost',user='root',passwd='123456',db='weather',charset='utf8')
    cur = conn.cursor()
    # pm is text scraped from a web page, so let the driver quote it rather
    # than interpolating it into the statement.
    sql = "UPDATE record SET pm2_5 = %s WHERE date = %s"
    params = (str(pm), str(now))
    print("%s %r" % (sql, params))
    cur.execute(sql, params)
    conn.commit()
    cur.close()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
else:
    # Only report success when the update was actually committed.
    print('Air update and inser Database Success')
finally:
    if conn is not None:
        conn.close()
#raw_input()
然后写个线程,每隔半个小时更新一次,最后用 SimpleHTTPServer 运行就能访问了
# -*- coding: utf-8 -*-
import SimpleHTTPServer
import SocketServer
# Port the static file server listens on.
PORT = 80
Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
# An empty host string binds to all interfaces.
httpd = SocketServer.TCPServer(("", PORT), Handler)
print("serving at port %d" % PORT)
try:
    httpd.serve_forever()
finally:
    # Release the listening socket however the serve loop ends (e.g. Ctrl-C).
    httpd.server_close()
PS:运行两天后发现中国天气网有个大坑(见下图):晚上抓取的数据里没有白天的温度,也就是缺少最高温度。
于是果断转移阵地,改从 2345 获取数据。
#coding=utf-8
import urllib,re,ConfigParser,datetime,time
def getHtml(url):
    """Download url and return the raw response body.

    The response handle is closed before returning, including when the
    read itself raises.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
def getRel(html,reg):
    """Return every non-overlapping match of the pattern reg in html.

    The result has the shape re.findall produces: a list of strings for a
    pattern with at most one group, a list of tuples for several groups,
    and an empty list when nothing matches.
    """
    return re.compile(reg).findall(html)
# Fetch the forecast page from tianqi.2345.com and print every value that
# reg1 captures from it.
html = getHtml("http://tianqi.2345.com/shenzhen/59493.htm")
# NOTE(review): the patterns below look like they lost characters when the
# script was published (bare "s"/"w" where "\s"/"\w" would be expected, tags
# with empty attribute lists) - verify against the live page.
# reg2..reg5 are identical to the weather.com.cn patterns and are not used in
# the lines visible here.
reg1 = r'<font >(-?w+).{0,2}</font>' #temperature
reg2 = r'</span>s</em>s<i>(.+)</i>' #wind
reg3 = r'<p >(.+)</p>s<p >' #index
reg4 = r'<section >s.+s.+s.+7d1"><b>(.+)</b>' #feeling
reg5 = r'<section >s.+s.+s.+7d1"><b>.+</b>(.+)</aside>' #clothing suggestion
temputer=getRel(html,reg1)
print temputer