# Date: 2014-05-17  Views: 20993
#-*- coding=utf-8 -*-
import time,urllib2,urllib,re,HTMLParser,os
from htmlentitydefs import entitydefs
class PageParser(HTMLParser.HTMLParser):
    """Collect the text found inside ``<textarea>`` elements, translating entities.

    Feed markup with ``feed()``.  Text that appears between a ``<textarea>``
    start tag and its end tag is appended to ``self.data``.  Named entity
    references are translated through ``entitydefs``; numeric character
    references in the ASCII range are decoded.  Any reference that is not
    translated is kept as its literal ``&...;`` text so nothing is lost.
    """

    def __init__(self):
        self.data = ""      # text captured from textarea elements so far
        self.readcode = 0   # 1 while inside a <textarea>, 0 otherwise
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start capturing when a ``<textarea>`` opens."""
        if tag == 'textarea':
            self.readcode = 1

    def handle_data(self, data):
        """Append *data* to the captured text, but only inside a textarea."""
        if self.readcode:
            self.data += data

    def handle_endtag(self, tag):
        """Stop capturing when the ``</textarea>`` end tag is seen."""
        if tag == 'textarea':
            self.readcode = 0

    def handle_entityref(self, name):
        """Translate a named reference (``&name;``); keep unknown names verbatim."""
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Translate a numeric reference (``&#39;`` / ``&#x27;``).

        The Python 2 base-class handler does nothing, so without this
        override such references silently vanish from the captured text.
        Only ASCII code points are decoded: the result is then a plain
        ``str`` that can be appended to byte-string or unicode input alike.
        Other code points, and malformed references, are kept as literal
        reference text.
        """
        text = '&#' + name + ';'
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
        except ValueError:
            code = -1
        if 0 <= code < 128:
            text = chr(code)
        self.handle_data(text)

    def getdata(self):
        """Return everything captured so far."""
        return self.data
# NOTE: a `global` statement at module scope has no effect -- names bound here
# are already module-level.  Presumably `res` is shared with code elsewhere in
# this file; TODO confirm before removing.
global res
def _fetch_page(url, form_fields=None):
    """Return the raw body of *url*, closing the response even if reading fails.

    When *form_fields* (a dict) is given it is form-encoded and sent as the
    request body; otherwise the request carries no body.
    """
    payload = None
    if form_fields is not None:
        payload = urllib.urlencode(form_fields)
    response = urllib2.urlopen(url, payload)
    try:
        return response.read()
    finally:
        response.close()


def _extract_viewstate(page):
    """Return the value of the hidden ``__VIEWSTATE`` input in *page*, or None."""
    found = re.search(
        "<input type=\"hidden\" name=\"__VIEWSTATE\" id=\"__VIEWSTATE\" "
        "value=\"(.{1,50000}?)\" />",
        page)
    if found is None:
        return None
    return found.group(1)


def getACUrl(user_id='120320050'):
    """Page through a user's status listing and pass every page to ``findurl``.

    The status page is fetched once to obtain its ``__VIEWSTATE`` token.
    After that, each page is requested by posting the form back with a
    ``Page$First`` / ``Page$Next`` event argument and the token taken from
    the previous response.  Paging stops when a page no longer contains the
    "next page" link text, when no ``__VIEWSTATE`` can be found, or when a
    request fails (the error is printed).

    user_id -- value posted in the user-id text box; defaults to the id that
               used to be hard-coded here.
    """
    next_page_re = re.compile(unicode("下一页", "utf8"))
    url = "http://algorithm.fzu.edu.cn/OnlineJudgeUserStatus.aspx"
    parms = {
        '__EVENTTARGET': 'ctl00$MainRightHolder$UserStatusGridView',
        'ctl00$MainRightHolder$UserIdTextBox': user_id,
    }

    # The initial request is deliberately outside the try block: a failure
    # here propagates to the caller, as it always has.
    viewstate = _extract_viewstate(_fetch_page(url))
    if viewstate is None:
        return
    parms['__VIEWSTATE'] = viewstate

    event_argument = 'Page$First'
    while True:
        try:
            parms['__EVENTARGUMENT'] = event_argument
            # Every request after the first one advances by one page.
            event_argument = 'Page$Next'
            data = unicode(_fetch_page(url, parms), "utf8")
            findurl(data)
            if next_page_re.search(data) is None:
                break
            viewstate = _extract_viewstate(data)
            if viewstate is None:
                return
            parms['__VIEWSTATE'] = viewstate
        except Exception as e:
            # Best effort: report the problem and stop paging.
            print(e)
            break
def Login(username, password):
    """Simulate a browser login by posting the login form of ``Default.aspx``.

    A cookie-aware opener is installed globally with
    ``urllib2.install_opener``, so later ``urllib2.urlopen`` calls go through
    the same cookie processor.  Any error is printed rather than raised, and
    nothing is returned.

    NOTE(review): the ``__VIEWSTATE`` token below is a fixed value; presumably
    it was captured from the login page at some point -- confirm the server
    still accepts it.
    """
    try:
        cookies = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookies)
        urllib2.install_opener(opener)
        parms = {
            '__VIEWSTATE': r'/wEPDwULLTE2ODk5MTAyOTUPZBYCAgMPZBYCAgUPEA8WAh4LXyFEYXRhQm91bmRnZBAVBgzmnIDmlrDkv6Hmga8kMTLmnIg15pel566X5rOV6K++5YGc5LiK5LiA5qyh77yM6K++EuS8mOengOS9nOS4muWAmemAiSgg5YWz5LqO6aKY55uu55qE5pe26Ze056m66Ze06ZmQ5Yi255qE6ZeuJOWFs+S6jueZu+mZhuezu+e7n+eUqOaIt+WQjeS4juWvhueggQg+PuabtOWkmhUGABdTaG93QnVsbGV0aW4uYXNweD9iaWQ9NRdTaG93QnVsbGV0aW4uYXNweD9iaWQ9NBdTaG93QnVsbGV0aW4uYXNweD9iaWQ9MxdTaG93QnVsbGV0aW4uYXNweD9iaWQ9MhFCdWxsZXRpbkxpc3QuYXNweBQrAwZnZ2dnZ2cWAGQYAgUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFEkJhbm5lciRMb2dpbkJ1dHRvbgUgQmFubmVyJFVzZXJDb250cm9sUGFuZWxNdWx0aVZpZXcPD2RmZOAvQzwaH/EzyqdrNO7IO2UefuMIHdnWhg02m4yXus4K',
            'Banner$LoginButton.x': '17',
            'Banner$LoginButton.y': '5',
            r"Banner$UserNameText": username,
            r"Banner$Password": password,
        }
        loginUrl = "http://algorithm.fzu.edu.cn/Default.aspx"
        login = urllib2.urlopen(loginUrl, urllib.urlencode(parms))
        try:
            # The body is read in full but its content is not used.
            login.read()
        finally:
            login.close()
    except Exception as e:
        # Best effort: report the failure instead of propagating it.
        print(e)
def findurl(data):
r=re.compile("<a class=\"underline\" href=\".{1,500}?\" target=\"_blank\">.{1,500}?</a></td><td><a class=\"hover-underline\"