Python 3 Scraping Learning Notes
A simple example of downloading an HTML document:
<code class="python">import urllib.request
import urllib.error
def download(url, num_retries=2):
    """Download *url* and return the raw response bytes, or None on failure.

    Retries up to *num_retries* times, but only for 5xx server errors
    (client errors such as 404 are not worth retrying).
    """
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download Error', e.reason)
        html = None
        if num_retries > 0:
            # BUG FIX: the original tested hasattr('e', 'code') -- the
            # *string* 'e' -- which is always False, so the retry branch
            # could never trigger. Check the exception object itself.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html
Python 3 uses urllib.request and urllib.error to replace Python 2's urllib2, so both of these modules need to be imported.
If the downloaded HTML document is encoded in UTF-8 (e.g. a Chinese website), it needs to be decoded.
<code class="python">htmlDoc=download('http://www.baidu.com')
htmlDoc1=htmlDoc.decode(encoding='utf-8') #type(htmlDOc)==bytes & type(htmlDoc1)==str
file=open('thisHtml.html','w') #save this html
file.write(htmlDoc1)
file.close()
About hasattr(object, name)
Checks whether an object has an attribute or method called 'name'; returns a bool value.
class test():
    """Small demo class used to illustrate hasattr()."""

    # Class attribute visible via hasattr(t, 'name').
    name = 'xiaohui'

    def run(self):
        # BUG FIX: the original returned bare `name`, which raises
        # NameError when called -- the attribute must be read via self.
        return self.name
>>>t=test()
>>>hasattr(t,'name')
True
>>>hasattr(t,'run')
True
>>>hasattr(t,'age')
False
todo:
- [x] set user agent
- [ ] limit the download speed to avoid being banned
Q: how to download an image?
3 ways to scrape HTML documents:
- re: regular expressions (fast but difficult)
- BeautifulSoup (easy but slow; a pure Python package)
- lxml (fast and easy)
Using lxml with cssselect:
a#home: selects elements with id='home'
a.home: selects elements with class='home'
<code class="python">import urllib.request
import urllib.error
import lxml.html
def download(url, user_agent='wswp', num_retries=2):
    """Download *url* and return its body decoded as UTF-8, or None on failure.

    Sends a custom User-agent header; retries up to *num_retries* times,
    but only on 5xx server errors.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download Error', e.reason)
        html = None
        if num_retries > 0:
            # BUG FIX: hasattr('e', 'code') checked the *string* 'e', so
            # 5xx responses were never retried; check the exception object.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    # BUG FIX: after a failed download html is None and .decode() would
    # raise AttributeError; propagate None instead.
    return html.decode('utf-8') if html is not None else None
def select(html, css):
    """Return the text content of the first element matching *css* in *html*.

    Raises IndexError when nothing matches the selector.
    """
    document_tree = lxml.html.fromstring(html)
    first_match = document_tree.cssselect(css)[0]
    return first_match.text_content()
Use a queue to track links:
<code class="python">crawl_queue=[seed_url]
# Process links until the work list is empty. list.pop() removes the most
# recently appended link, so this crawls depth-first. NOTE(review): there is
# no de-duplication, so already-seen pages can be queued again.
while crawl_queue:
    url=crawl_queue.pop()
    html=download(url)
    for link in get_links(html):
        # trans links: resolve relative hrefs into absolute URLs
        link=urllib.parse.urljoin(seed_url,link)
        crawl_queue.append(link)
# Stub only -- no body is given in the notes; presumably it should extract
# the anchor hrefs from the page.
def get_links(html):
    #return all links in this html
Built an HFUT notice push system.
todo about HFUT notice push
- [x] timer run
- [x] css:path error
Because the mail system deletes the header directly, the webpage still cannot display correctly even though I have fixed the path error.
Notices come from different websites, so there is no common way to extract the main text.
todo
- [ ] fix above problem
<code class="python">from test import download,select,selectText,selectCss
from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib
import urllib.parse
def write_data():
    """Fetch the news index and persist the most recent notice link to news.txt."""
    index = download('http://news.hfut.edu.cn/list-2-1.html')
    recentNewsUrl = select(index, 'ul.content.list.pushlist.lh30>li>a')
    # BUG FIX: use a context manager so the file is closed even if write() raises.
    with open('news.txt', 'w') as file:
        file.write(recentNewsUrl)
def scan():
    """Check the news index for a new notice; mail it and update news.txt if found."""
    # Last-seen notice URL persisted by a previous scan()/write_data() call.
    # BUG FIX: context managers replace manual open/close pairs so the file
    # handles are released even when an exception occurs in between.
    with open('news.txt', 'r') as file:
        url = file.read()
    firstPage = 'http://news.hfut.edu.cn/list-2-1.html'
    index = download(firstPage)
    recentNewsUrl = select(index, 'ul.content.list.pushlist.lh30>li>a')
    recentNewsUrl = urllib.parse.urljoin(firstPage, recentNewsUrl)
    if recentNewsUrl != url:
        sendmail(recentNewsUrl)
        with open('news.txt', 'w') as f:
            f.write(recentNewsUrl)
def sendmail(url):
    """Download the notice page at *url* and email it as an HTML message."""
    oldHtml = download(url)
    # Main text of the notice; the CSS path is site-specific.
    pureHtml = selectText(oldHtml, 'div#artibody.content.f16')
    # Rewrite relative resource URLs to absolute ones before extracting the title.
    html = path(url, oldHtml)
    title = selectText(html, 'title')
    # SECURITY: credentials are hard-coded in source; move them to a config
    # file or environment variables before publishing this code.
    fromAddr = 'peacefullion@sina.com'
    pw = 'lion1998'
    toAddr = '784081811@qq.com'
    smtpServer = 'smtp.sina.com'
    msg = MIMEText(pureHtml, 'html', 'utf-8')
    msg['From'] = _format_addr('PeacefulLion Server <%s>' % fromAddr)
    msg['To'] = toAddr  # _format_addr('PeacefulLion <%s>' % toAddr)
    msg['Subject'] = Header(title, 'utf-8').encode()
    server = smtplib.SMTP(smtpServer, 25)
    try:
        server.set_debuglevel(1)
        server.login(fromAddr, pw)
        server.sendmail(fromAddr, toAddr, msg.as_string())
    finally:
        # BUG FIX: always close the SMTP connection, even if login/send fails.
        server.quit()
    print('ok')
def path(url, html):
    """Rewrite relative CSS/JS resource URLs in *html* to absolute ones based on *url*."""
    cssList = selectCss(html, 'head>link', 'href')
    jsList = selectCss(html, 'head>script', 'src')
    # BUG FIX: guard against empty strings as well as None --
    # html.replace('', x) would insert x between every character.
    for css in cssList:
        if css:
            html = html.replace(css, urllib.parse.urljoin(url, css))
    for js in jsList:
        if js:
            html = html.replace(js, urllib.parse.urljoin(url, js))
    return html
def _format_addr(s):
name, addr = parseaddr(s)
return formataddr((Header(name, 'utf-8').encode(), addr))
<code class="python">import time
from hfutNews import scan
def run(timeSpan=1200):
    # Poll forever: call scan() once every `timeSpan` seconds
    # (default 1200 s = 20 minutes). Blocks the calling thread indefinitely.
    while True:
        scan()
        time.sleep(timeSpan)