01_douban.py
------------------------------
import requests
import json
# 1. prepare the URL templates
start_url_temp_list = [
    {
        "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?os=ios&for_mobile=1&start={}&count=18",
        "country": "US"
    },
    {
        "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=ios&for_mobile=1&start={}&count=18",
        "country": "CN"
    }
]
headers = {"User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"}
def parse_url(url):  # 2. send the request and return the response body
    print("requesting:", url)
    r = requests.get(url, headers=headers)
    return r.content.decode()

def get_content_list(json_response):  # 3. extract the data
    dict_response = json.loads(json_response)
    content_list = dict_response["subject_collection_items"]
    total = dict_response["total"]
    return content_list, total

def save_content_list(content_list):  # 4. save content_list
    # open the file once per call and close it once, rather than once per item
    with open("douban.txt", "a", encoding="utf-8") as f:
        for content in content_list:
            f.write(json.dumps(content, ensure_ascii=False, indent=2))
            f.write("\n")  # separate the records so the file stays readable
    print("saved")
def run():  # the main control flow
    # 1. iterate over the URL templates
    for url_temp in start_url_temp_list:
        num = 0
        total = 100  # placeholder; replaced by the real total after the first request
        while num < total:  # start < total means items remain; the last page may be partial
            url = url_temp["url_temp"].format(num)
            # 2. send the request and get the response
            json_response = parse_url(url)
            # 3. extract the data
            content_list, total = get_content_list(json_response)
            for content in content_list:  # tag each item with its country
                content["country"] = url_temp["country"]
            # 4. save
            save_content_list(content_list)
            num = num + 18

if __name__ == '__main__':
    run()
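# Side note (not part of the original script): the paging loop above can also be
# driven directly by the total returned with the first page. A minimal sketch,
# assuming the API keeps returning "total" and 18 items per request as above:
def crawl_one_list(url_temp, country):
    content_list, total = get_content_list(parse_url(url_temp.format(0)))
    for start in range(18, total, 18):  # remaining pages, 18 items each
        content_list.extend(get_content_list(parse_url(url_temp.format(start)))[0])
    for content in content_list:
        content["country"] = country
    return content_list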
------------------------------
01_try_requests.py
------------------------------
# coding=utf-8
import requests
url = "http://www.baidu.com"
# url1 = "www.baidu.com"  # this URL is missing the scheme (http://), so requests would reject it
r = requests.get(url)
print(r)
# manually specify the encoding
# r.encoding = "utf-8"
# print(r.text)
# another way to get the page source: decode the raw bytes yourself
# print(type(r.content))
print(r.content.decode())
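# Side note (not part of the original script): r.text decodes with the encoding
# requests guessed from the response headers, while r.content is raw bytes you
# decode yourself. A minimal sketch of correcting a wrong guess:
r2 = requests.get(url)
print(r2.encoding)           # encoding guessed from the headers
print(r2.apparent_encoding)  # encoding detected from the body
r2.encoding = r2.apparent_encoding
print(r2.text[:100])         # r2.text now decodes with the detected encoding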
------------------------------
02_baidu_tupian.py
------------------------------
# coding=utf-8
import requests
r = requests.get("https://ss0.bdstatic.com/5aV1bjqh_Q23odCf/static/superman/img/logo/logo_white.png")
print(r.status_code)
print(r.headers)
print("*"*10)
print(r.request.headers)
# when saving an image or a video locally, write the binary data (r.content)
with open("baidu.png", "wb") as f:
    f.write(r.content)
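# Side note (not part of the original script): for large files, streaming avoids
# holding the whole body in memory. A minimal sketch downloading the same image
# in 1 KB chunks (baidu_stream.png is a hypothetical output name):
r2 = requests.get("https://ss0.bdstatic.com/5aV1bjqh_Q23odCf/static/superman/img/logo/logo_white.png", stream=True)
with open("baidu_stream.png", "wb") as f2:
    for chunk in r2.iter_content(chunk_size=1024):
        f2.write(chunk)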
------------------------------
03_headers.py
------------------------------
# coding=utf-8
import requests
# define a headers dict carrying a browser User-Agent
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
url = "http://www.baidu.com"
r = requests.get(url,headers=headers)
print(r.content.decode())
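# Side note (not part of the original script): to confirm which User-Agent was
# actually sent, inspect the headers on the prepared request:
print(r.request.headers["User-Agent"])                  # the browser UA set above
print(requests.get(url).request.headers["User-Agent"])  # default: python-requests/x.y.z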
------------------------------
04_sina.py
------------------------------
# coding=utf-8
import requests
url = "http://www.sina.com"
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
r = requests.get(url,headers=headers)
# print(r.text)
print(r.content.decode())
------------------------------
05_baidufanyi.py
------------------------------
import requests
import json
import random
import hashlib
# 1. prepare the URL and the request parameters
post_url = "http://api.fanyi.baidu.com/api/trans/vip/translate"
query_string = input("Enter the text to translate: ")
appid = "20180917000208070"
appsecret = "BfD6HMJ9_izyRBA61glS"
salt = random.randint(32768, 65536)  # random salt; randint(1, 1) would always return 1
sign_str = appid + query_string + str(salt) + appsecret
sign = hashlib.md5(sign_str.encode("utf-8")).hexdigest()  # sign = MD5(appid + q + salt + secret)
print(sign)
params = {"appid": appid, "q": query_string, "from": "en", "to": "zh", "salt": salt, "sign": sign}
# 2. send the request and get the response; passing params lets requests URL-encode the query
response = requests.get(post_url, params=params)
json_response = response.content.decode()  # the raw JSON string
print(json_response)
# 3. extract the data
dict_response = json.loads(json_response)  # parse the JSON string into a dict
ret = dict_response["trans_result"][0]["dst"]
print("Translation of {}: {}".format(query_string, ret))
------------------------------
06_session.py
------------------------------
# coding=utf-8
import requests
post_url = "http://www.renren.com/PLogin.do"
post_data = {"email":"mr_mao_hacker@163.com", "password":"alarmchime"}
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
session = requests.session()  # instantiate a session
session.post(post_url, data=post_data, headers=headers)  # log in via POST; the session keeps the login cookies
# reuse the same session (now carrying the cookies) to request a page that requires login
response = session.get("http://www.renren.com/327550029/profile",headers=headers)
f = open("renren.html","w",encoding="utf-8")
f.write(response.content.decode())
f.close()
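# Side note (not part of the original script): to inspect the cookies the login
# stored in the session, convert the cookie jar to a plain dict:
print(requests.utils.dict_from_cookiejar(session.cookies))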
------------------------------
07_cookies.py
------------------------------
import requests
post_url = "http://www.renren.com/PLogin.do"
post_data = {"email":"mr_mao_hacker@163.com", "password":"alarmchime"}
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
"Cookie":"anonymid=j3jxk555-nrn0wh; _r01_=1; JSESSIONID=abcX04wUgB28nc3au698v; depovince=BJ; _uij=JTdCJTIydXNlcklkJTIyJTNBMzI3NTUwMDI5JTJDJTIydXNlck5hbWUlMjIlM0ElMjIlRTYlQUYlOUIlRTUlODUlODYlRTUlODYlOUIlMjIlMkMlMjJoZWFkRnVsbFVybCUyMiUzQSUyMmh0dHAlM0ElMkYlMkZoZG4ueG5pbWcuY24lMkZwaG90b3MlMkZoZG4zMjElMkYyMDE3MTAxMCUyRjE2MTUlMkZoZWFkX01uUFBfYzk3NTAwMDA4NGZlMTk4Ni5qcGclMjIlMkMlMjJnZW5kZXIlMjIlM0ElMjIlRTclOTQlQjclRTclOTQlOUYlMjIlMkMlMjJsb2dDb3VudCUyMiUzQTIzMyUyQyUyMmNvZGUlMjIlM0EwJTdE; renrenuid=327550029; _ga=GA1.2.1274811859.1497951251; _gid=GA1.2.425390350.1509330131; ch_id=10016; jebecookies=355a2bd2-ff48-4ca7-849d-0cfa70d01373|||||; ick_login=bb62b795-7cc8-4fed-851e-bdb8552a0f2f; _de=BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5; p=456b02be83f0342ca2f81410a5feb0fa9; first_login_flag=1; ln_uact=mr_mao_hacker@163.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn321/20171010/1615/main_MnPP_c975000084fe1986.jpg; t=59278a421bf091324fd085192873a61c9; societyguester=59278a421bf091324fd085192873a61c9; id=327550029; xnsid=929e661c; loginfrom=syshome; wp_fold=0"}
# the Cookie header carries the login state directly, so a plain request works without a session
response = requests.get("http://www.renren.com/327550029/profile",headers=headers)
f = open("renren.html","w",encoding="utf-8")
f.write(response.content.decode())
f.close()
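# Side note (not part of the original script): requests also accepts cookies as a
# dict via the cookies= parameter. A minimal sketch that splits the long Cookie
# string above into key/value pairs:
cookie_str = headers.pop("Cookie")
cookies = dict(kv.split("=", 1) for kv in cookie_str.split("; "))
response = requests.get("http://www.renren.com/327550029/profile", headers=headers, cookies=cookies)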
------------------------------
parse.py
------------------------------
import requests
from retrying import retry
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
@retry(stop_max_attempt_number=3)  # retry up to 3 times before giving up
def _parse_url(url):  # does the actual sending of the request
    print("*" * 50)
    response = requests.get(url, headers=headers, timeout=3)
    assert response.status_code == 200  # a failed assert raises AssertionError, which triggers a retry
    return response.content.decode()
def parse_url(url):
    try:  # catch whatever exception is left once the retries are exhausted
        html_str = _parse_url(url)
    except Exception:
        html_str = None  # on failure, return None instead of raising
    return html_str
if __name__ == '__main__':
    url = "http://www.baidu.com"
    url1 = "www.baidu.com"  # missing scheme, so the request fails and parse_url returns None
    html_str = parse_url(url1)
    if html_str is not None:
        print(html_str[:500])
    else:
        print("error")
------------------------------
Please work through these exercises yourselves: be sure to type out the code by hand and practice programming step by step!