import mechanize import cookielib # Browser br = mechanize.Browser() # Cookie Jar cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) # Browser options br.set_handle_equiv(True) br.set_handle_gzip(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) # Follows refresh 0 but not hangs on refresh > 0 br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) # Want debugging messages"htmlcode"># Open some site, let's pick a random one, the first that pops in mind: r = br.open('http://google.com') html = r.read() # Show the source print html # or print br.response().read() # Show the html title print br.title() # Show the response headers print r.info() # or print br.response().info() # Show the available forms for f in br.forms(): print f # Select the first (index zero) form br.select_form(nr=0) # Let's search br.form['q']='weekend codes' br.submit() print br.response().read() # Looking at some results in link format for l in br.links(url_regex='stockrt'): print l如果你访问的网站需要验证(http basic auth),那么:
# If the protected site didn't receive the authentication data you would # end up with a 410 error in your face br.add_password('http://safe-site.domain', 'username', 'password') br.open('http://safe-site.domain')由于之前使用了Cookie Jar,你不需要管理网站的登录session。也就是不需要管理需要POST一个用户名和密码的情况。
通常情况下,网站会要求你的浏览器存储一个session cookie,
并期望你再次访问该页面时带上同一个cookie。所有这些事情——存储和重发这个session cookie——都已经被Cookie Jar搞定了,爽吧。
# Testing presence of link (if the link is not found you would have to # handle a LinkNotFoundError exception) br.find_link(text='Weekend codes') # Actually clicking the link req = br.click_link(text='Weekend codes') br.open(req) print br.response().read() print br.geturl() # Back br.back() print br.response().read() print br.geturl()下载一个文件:
# Download f = br.retrieve('http://www.google.com.br/intl/pt-BR_br/images/logo.gif')[0] print f fh = open(f)为http设置代理
# Proxy and user/password br.set_proxies({"http": "joe:password@myproxy.example.com:3128"}) # Proxy br.set_proxies({"http": "myproxy.example.com:3128"}) # Proxy password br.add_proxy_password("joe", "password")但是,如果你只想要打开网页,而不需要之前所有神奇的功能,那你可以:
# Simple open(此处的示例代码在转载时丢失)
请访问 mechanize官方网站(http://wwwsearch.sourceforge.net/mechanize)、mechanize文档 和ClientForm的文档 了解更多。
#! /usr/bin/env python #coding=utf-8 import urllib2 import urllib import cookielib data={"email":"用户名","password":"密码"} #登陆用户名和密码 post_data=urllib.urlencode(data) cj=cookielib.CookieJar() opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) headers ={"User-agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"} req=urllib2.Request("http://www.renren.com/PLogin.do",post_data,headers) content=opener.open(req) print content.read().decode("utf-8").encode("gbk")具体请参考:
http://www.crazyant.net/796.html Python使用cookielib和urllib2模拟登陆新浪微博并抓取数据
http://my.oschina.net/duhaizhang/blog/69342 urllib2模块
https://docs.python.org/2/library/cookielib.html cookielib — Cookie handling for HTTP clients