Python3 Scraping Learning Notes

Python3 Scraping Learning Notes

A simple example that downloads an HTML document:

<code class="python">import urllib.request
import urllib.error
def download(url,num_retries=2):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch.
        num_retries: How many more attempts are allowed after a 5xx
            server error.

    Returns:
        The response body as bytes, or None when the download failed.
    """
    print('Downloading:',url)
    try:
        return urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download Error',e.reason)
        # Only HTTPError carries a status code; a plain URLError has none.
        # Look the attribute up on the exception object itself -- testing
        # the string literal 'e' is always False and disables the retry.
        code=getattr(e,'code',None)
        if num_retries>0 and code is not None and 500<=code<600:
            # 5xx errors are server-side and may be transient, so try again.
            return download(url,num_retries-1)
        return None

Python3 uses urllib.request and urllib.error to replace Python2's urllib2, so both of them need to be imported.

If the downloaded HTML document is encoded in UTF-8 (e.g. a Chinese website), the returned bytes need to be decoded.

<code class="python">htmlDoc=download('http://www.baidu.com')
# download() returns bytes on success and None on failure, so only decode
# and save when something was actually downloaded.
if htmlDoc is not None:
    htmlDoc1=htmlDoc.decode(encoding='utf-8') #type(htmlDoc)==bytes & type(htmlDoc1)==str
    # Write with an explicit encoding: the platform default may be unable to
    # represent the page's characters. `with` closes the file even on error.
    with open('thisHtml.html','w',encoding='utf-8') as f: #save this html
        f.write(htmlDoc1)

About hasattr(object,name)

Checks whether an object has an attribute (or method) called ‘name’ and returns a bool.

<code class="python">class test():
    name='xiaohui'
    def run(self):
        return name
>>>t=test()
>>>hasattr(t,'name')
True
>>>hasattr(t,'run')
True
>>>hasattr(t,'age')
False

todo:

  • [x] set user agent
  • [ ] limit the download speed to avoid being banned
    Q: how to download an image?

3 ways to scrape HTML documents

  • re:regular expression(fast but difficult)
  • BeautifulSoup(ez but slow,a pure python package)
  • Lxml(fast and ez)
    using lxml with cssselect
    a#home:select id=’home’
    a.home:select class=’home’
<code class="python">import urllib.request
import urllib.error
import lxml.html
def download(url,user_agent='wswp',num_retries=2):
    """Download *url* with a custom User-agent and return it as text.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts are allowed after a 5xx
            server error.

    Returns:
        The response body decoded as UTF-8, or None when the download
        failed.
    """
    print('Downloading:',url)
    headers={'User-agent':user_agent}
    request=urllib.request.Request(url,headers=headers)
    try:
        html=urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download Error',e.reason)
        # Only HTTPError carries a status code; look it up on the exception
        # object itself (testing the string 'e' is always False).
        code=getattr(e,'code',None)
        if num_retries>0 and code is not None and 500<=code<600:
            # Forward user_agent explicitly: passing the retry counter as
            # the second positional argument would overwrite the user agent
            # and reset num_retries to its default.
            return download(url,user_agent,num_retries-1)
        # Nothing was downloaded, so there is nothing to decode.
        return None
    return html.decode('utf-8')
def select(html,css):
    """Return the text content of the first element of *html* matching *css*.

    *html* is parsed with ``lxml.html.fromstring`` and queried with the CSS
    selector *css*. Indexing the result raises IndexError when nothing
    matches.
    """
    matches=lxml.html.fromstring(html).cssselect(css)
    first=matches[0]
    return first.text_content()
<code class="python">crawl_queue=[seed_url]
# Remember every URL that was ever queued so pages linking to each other
# are not downloaded again and again.
seen=set(crawl_queue)
while crawl_queue:
    url=crawl_queue.pop()
    html=download(url)
    if html is None:
        # Download failed; there is nothing to extract links from.
        continue
    # The link loop must run inside the while loop, once per downloaded
    # page; otherwise newly found links are never crawled.
    for link in get_links(html):
        link=urllib.parse.urljoin(seed_url,link) #trans links
        if link not in seen:
            seen.add(link)
            crawl_queue.append(link)
def get_links(html):
    """Return all links in this html.

    Args:
        html: Page source as a string; None or an empty string yields
            an empty list.

    Returns:
        A list with the ``href`` value of every ``<a>`` tag, in document
        order. Values are returned as written (they may be relative).
    """
    import re  # local import keeps this snippet self-contained
    if not html:
        return []
    # Quoted href attribute inside an <a ...> tag, case-insensitive.
    return re.findall(r'<a[^>]+href=["\'](.*?)["\']',html,re.IGNORECASE)

Built an HFUT notice push system.

todo about HFUT notice push

  • [x] timer run
  • [x] css:path error

Because the mail system removes the header outright, the web page still cannot be displayed correctly even though I have fixed the path error.
Notices come from different websites, so there is no common way to extract the main text.

todo

  • [ ] fix above problem
<code class="python">from test import download,select,selectText,selectCss
from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib
import urllib.parse
def write_data():
    """Record the most recent news entry of the list page in ``news.txt``.

    The value is joined with the list-page URL before it is stored, which
    is the same form scan() compares against; storing the un-joined value
    would make the first scan() always look like a new notice.
    """
    firstPage='http://news.hfut.edu.cn/list-2-1.html'
    index=download(firstPage)
    recentNewsUrl=select(index,'ul.content.list.pushlist.lh30>li>a')
    recentNewsUrl=urllib.parse.urljoin(firstPage,recentNewsUrl)
    # `with` closes the file even when the write fails.
    with open('news.txt','w') as f:
        f.write(recentNewsUrl)
def scan():
    """Check the news list page and mail the newest entry if it changed.

    Reads the last pushed URL from ``news.txt``, fetches the list page and
    compares its first entry against it. When they differ, the entry is
    mailed via sendmail() and ``news.txt`` is updated.
    """
    try:
        with open('news.txt','r') as f:
            url=f.read()
    except FileNotFoundError:
        # First run: nothing has been pushed yet, so any entry counts as new.
        url=''
    firstPage='http://news.hfut.edu.cn/list-2-1.html'
    index=download(firstPage)
    recentNewsUrl=select(index,'ul.content.list.pushlist.lh30>li>a')
    recentNewsUrl=urllib.parse.urljoin(firstPage,recentNewsUrl)
    if recentNewsUrl!=url:
        sendmail(recentNewsUrl)
        # Only remember the entry after the mail went out, so a failed
        # send is retried on the next scan.
        with open('news.txt','w') as f:
            f.write(recentNewsUrl)
def sendmail(url):
    """Download the notice at *url* and mail its main content.

    The mail body is the part of the page selected by
    ``div#artibody.content.f16``; the subject is the page's ``<title>``.

    Args:
        url: Address of the notice page to push.
    """
    oldHtml=download(url)
    pureHtml=selectText(oldHtml,'div#artibody.content.f16')
    html=path(url,oldHtml)
    title=selectText(html,'title')
    fromAddr='peacefullion@sina.com'
    # SECURITY(review): the account password is hard-coded in source and the
    # login below goes over plain SMTP on port 25. Consider loading the
    # credentials from the environment and using STARTTLS/SMTPS.
    pw='lion1998'
    toAddr='784081811@qq.com'
    smtpServer='smtp.sina.com'
    msg = MIMEText(pureHtml, 'html', 'utf-8')
    msg['From'] = _format_addr('PeacefulLion Server <%s>' % fromAddr)
    msg['To'] = toAddr#_format_addr('PeacefulLion <%s>' % toAddr)
    msg['Subject'] = Header(title, 'utf-8').encode()
    # The context manager issues QUIT and closes the connection even when
    # login or sending raises, so the socket is never leaked.
    with smtplib.SMTP(smtpServer, 25) as server:
        server.set_debuglevel(1)
        server.login(fromAddr, pw)
        server.sendmail(fromAddr, toAddr, msg.as_string())
    print('ok')
def path(url,html):
    """Rewrite stylesheet and script references in *html* as absolute URLs.

    Args:
        url: Address the page was downloaded from; used as the base for
            resolving relative references.
        html: Page source as a string.

    Returns:
        *html* with every ``head>link`` href and ``head>script`` src
        replaced by its absolute form.
    """
    import re  # local import: only needed for the single-pass rewrite
    cssList=selectCss(html,'head>link','href')
    jsList=selectCss(html,'head>script','src')
    absolute={}
    for ref in list(cssList)+list(jsList):
        # Skip missing attributes (None) and empty strings: replacing ''
        # would insert the URL between every character of the page.
        if ref:
            absolute[ref]=urllib.parse.urljoin(url,ref)
    if not absolute:
        return html
    # Substitute everything in one pass, longest reference first, so text
    # that was already rewritten is never matched again by a reference
    # that is repeated or is a substring of another one.
    ordered=sorted(absolute,key=len,reverse=True)
    pattern=re.compile('|'.join(re.escape(ref) for ref in ordered))
    return pattern.sub(lambda m:absolute[m.group(0)],html)


def _format_addr(s):
    """Return *s* ("Name <addr>") with the display name header-encoded as UTF-8."""
    display_name, address = parseaddr(s)
    encoded_name = Header(display_name, 'utf-8').encode()
    return formataddr((encoded_name, address))
<code class="python">import time
from hfutNews import scan
def run(timeSpan=1200):
    """Poll forever: call scan(), wait *timeSpan* seconds, and repeat."""
    pause=time.sleep
    while True:
        scan()
        pause(timeSpan)