• 通过urllib2获取网页内容
import urllib2
response = urllib2.urlopen('http://www.baidu.com')
html = response.read()
print html
####
import urllib2
req = urllib2.Request('http://www.baidu.com')
response = urllib2.urlopen(req)
the_page = response.read()
print the_page
  • 发送POST请求
# POST: urlencode the form fields and pass them as the request body.
import urllib
import urllib2

url = 'http://www.someserver.com/register.cgi'
form = {
    'name': 'WHY',
    'location': 'SDU',
    'language': 'Python',
}
body = urllib.urlencode(form)           # encode the form data
post_req = urllib2.Request(url, body)   # a data argument makes this a POST
reply = urllib2.urlopen(post_req)       # send the request
the_page = reply.read()                 # read the response body
  • 发送GET请求
import urllib2
import urllib

data = {}

data['name'] = 'WHY'
data['location'] = 'SDU'
data['language'] = 'Python'

url_values = urllib.urlencode(data)
print url_values

name=Somebody+Here&language=Python&location=Northampton
url = 'http://www.example.com/example.cgi'
full_url = url + '?' + url_values
data = urllib2.open(full_url)
  • 设置UA,python的默认UA是python-urllib/x.y
# Override the User-Agent header (Python's default is python-urllib/x.y).
import urllib
import urllib2

url = 'http://www.someserver.com/cgi-bin/register.cgi'

ua_string = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
form = {
    'name': 'WHY',
    'location': 'SDU',
    'language': 'Python',
}

extra_headers = {'User-Agent': ua_string}
encoded = urllib.urlencode(form)
req = urllib2.Request(url, encoded, extra_headers)
response = urllib2.urlopen(req)
the_page = response.read()
  • URL ERROR
import urllib2
req = urllib2.Request('http://www.baibai.com')
try: urllib2.urlopen(req)
except urllib2.URLError, e:
    print e.reason
  • HTTP ERROR
import urllib2
req = urllib2.Request('http://lightless.me/a.php')
try:
    urllib2.urlopen(req)
except Exception, e:
    print e.code
  • 获取真实的URL
import urllib2

old_url = 'http://rrurl.cn/b1UZuP'
req = Request(old_url)
response = urlopen(req)
print 'Old url: ' + old_url
print 'Real url: ' + response.geturl()
  • 获取Header信息
import urllib2

old_url = 'http://www.baidu.com'
req = Request(old_url)
response = urlopen(req)
print response.info()
  • Proxy的设置,默认情况下会使用环境变量http_proxy来设置代理。
# Explicit proxy configuration; by default urllib2 honours the
# http_proxy environment variable.
import urllib2

enable_proxy = True
with_proxy = urllib2.ProxyHandler({"http" : 'http://some-proxy.com:8080'})
without_proxy = urllib2.ProxyHandler({})
handler = with_proxy if enable_proxy else without_proxy
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
  • Timeout设置
# Before Python 2.6 urlopen() had no timeout argument; the only option
# was a process-wide default on the socket module.
import urllib2
import socket
socket.setdefaulttimeout(10)
urllib2.socket.setdefaulttimeout(10)  # same socket module, reached through urllib2

# From Python 2.6 on, the timeout can be passed per call.
import urllib2
response = urllib2.urlopen('http://www.google.com', timeout=10)
  • 在HTTP Request中增加指定的Header
import urllib2
request = urllib2.Request('http://www.baidu.com/')
request.add_header('User-Agent', 'fake-client')
response = urllib2.urlopen(request)
print response.read()
  • 打开Debug log
# Turn on wire-level debug logging for both HTTP and HTTPS traffic.
import urllib2

debug_opener = urllib2.build_opener(
    urllib2.HTTPHandler(debuglevel=1),
    urllib2.HTTPSHandler(debuglevel=1),
)
urllib2.install_opener(debug_opener)
response = urllib2.urlopen('http://www.google.com')