# Image crawler for Python 2.7; beginner project, feedback welcome.
import os
import time
import threading
from multiprocessing import Pool, cpu_count
import urllib
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
from bs4 import BeautifulSoup
# Request headers sent with every HTTP call: a desktop-Chrome User-Agent,
# an AJAX marker, and the site's own Referer so the server serves pages.
_UA = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
       '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
HEADERS = {
    'Referer': 'http://www.umei.cc',
    'User-Agent': _UA,
    'X-Requested-With': 'XMLHttpRequest',
}
# --- Stage 1: collect album detail-page URLs from the list pages --------
# Walks index pages 1..141; every <a target="_blank"> on a list page links
# to an album page.  One URL per line is written to rihan_umei_url.txt.
url = 'http://www.umei.cc/p/gaoqing/rihan/'
# BUG FIX: use `with` so the file is closed even if a request raises;
# f.write() replaces the Python-2-only `print >> f` statement.
with open('rihan_umei_url.txt', 'w') as f:
    for i in range(1, 142):
        page_url = url + str(i) + '.htm'
        r = requests.get(page_url, headers=HEADERS, timeout=10).text
        for sub_page_urls in BeautifulSoup(r, 'lxml').find_all('a', target='_blank'):
            # .get() instead of ['href']: skip anchors without an href
            # instead of raising KeyError and aborting the whole crawl.
            sub_page_url = sub_page_urls.get('href')
            if sub_page_url:
                f.write(sub_page_url + '\n')
# --- Stage 2: walk every album and download each image ------------------
# For each album URL, read the total page count from the 'NewPages' pager,
# then fetch page 1..N and save the image found in div.ImageBody as
# <album-digits>_<page>.jpg under SAVE_DIR.
SAVE_DIR = 'f:/xiuren/'
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)  # urllib.urlretrieve fails if the dir is missing
# BUG FIX: read back the same file stage 1 wrote ('rihan_umei_url.txt');
# the original opened an unrelated absolute path E:/python/... instead.
with open('rihan_umei_url.txt', 'r') as f:
    lines = f.readlines()
for index, line in enumerate(lines):
    tem_line = line.strip()
    if not tem_line:
        continue  # ignore blank lines in the URL list
    # The digits embedded in the URL (album id/date) name the saved files.
    # (join-comprehension is portable; py3 filter() returns an iterator.)
    page_index = ''.join(ch for ch in tem_line if ch.isdigit())
    r = requests.get(tem_line, headers=HEADERS, timeout=10).text
    pager = BeautifulSoup(r, 'lxml').find('div', class_='NewPages')
    if pager is None:
        continue  # layout changed or bad page: skip album, don't crash
    page_num = '1'  # albums with no pager have a single page
    for countmax in pager.find_all('a', limit=1):
        total_page = countmax.get_text()
        page_num = ''.join(ch for ch in total_page if ch.isdigit()) or '1'
    # BUG FIX: rstrip('.htm') strips the CHARACTER SET {.,h,t,m} from the
    # right and can eat trailing 'h'/'t'/'m' of the name; slice the suffix.
    base = tem_line[:-len('.htm')] if tem_line.endswith('.htm') else tem_line
    for i in range(1, int(page_num) + 1):
        url = base + '_' + str(i) + '.htm'
        print(url)
        result = requests.get(url, headers=HEADERS, timeout=10).text
        body = BeautifulSoup(result, 'lxml').find('div', class_='ImageBody')
        p_tag = body.find('p') if body is not None else None
        if p_tag is None or p_tag.img is None:
            continue  # no image on this page: skip instead of AttributeError
        img_url = p_tag.img['src']
        file_name = os.path.join(SAVE_DIR, page_index + '_' + str(i) + '.jpg')
        print(file_name)
        urllib.urlretrieve(img_url, file_name)
import os
import time
import threading
from multiprocessing import Pool, cpu_count
import urllib
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
from bs4 import BeautifulSoup
# NOTE(review): everything from the re-imports above to the end of the file
# appears to be an accidental copy-paste of the first half — running the
# script executes the entire crawl twice; confirm and consider removing.
# Request headers: desktop-Chrome User-Agent, AJAX marker, site Referer.
_USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'http://www.umei.cc',
    'User-Agent': _USER_AGENT,
}
# --- Stage 1 (duplicated copy): collect album URLs from list pages ------
# Index pages 1..141; each <a target="_blank"> links to an album page.
# URLs are appended one per line to rihan_umei_url.txt.
url = 'http://www.umei.cc/p/gaoqing/rihan/'
# BUG FIX: `with` guarantees the file closes on error; f.write() replaces
# the Python-2-only `print >> f` statement.
with open('rihan_umei_url.txt', 'w') as f:
    for i in range(1, 142):
        page_url = url + str(i) + '.htm'
        r = requests.get(page_url, headers=HEADERS, timeout=10).text
        for sub_page_urls in BeautifulSoup(r, 'lxml').find_all('a', target='_blank'):
            # Tolerate anchors that lack an href attribute.
            sub_page_url = sub_page_urls.get('href')
            if sub_page_url:
                f.write(sub_page_url + '\n')
# --- Stage 2 (duplicated copy): download every image of every album -----
# Reads the URL list, derives the album's page count from the 'NewPages'
# pager, fetches page 1..N and saves the div.ImageBody image to SAVE_DIR.
SAVE_DIR = 'f:/xiuren/'
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)  # urlretrieve cannot create missing directories
# BUG FIX: open the file stage 1 actually wrote; the original read an
# unrelated absolute path E:/python/rihan_umei_url.txt.
with open('rihan_umei_url.txt', 'r') as f:
    lines = f.readlines()
for index, line in enumerate(lines):
    tem_line = line.strip()
    if not tem_line:
        continue  # skip blank lines
    # Digits in the URL (album id/date) are used to name the saved files;
    # the join-comprehension works on Python 2 and 3 alike.
    page_index = ''.join(ch for ch in tem_line if ch.isdigit())
    r = requests.get(tem_line, headers=HEADERS, timeout=10).text
    pager = BeautifulSoup(r, 'lxml').find('div', class_='NewPages')
    if pager is None:
        continue  # missing pager: skip album instead of AttributeError
    page_num = '1'  # default when the pager yields no links
    for countmax in pager.find_all('a', limit=1):
        total_page = countmax.get_text()
        page_num = ''.join(ch for ch in total_page if ch.isdigit()) or '1'
    # BUG FIX: rstrip('.htm') removes any trailing '.','h','t','m' chars,
    # not the '.htm' suffix as a unit; slice the suffix off explicitly.
    base = tem_line[:-len('.htm')] if tem_line.endswith('.htm') else tem_line
    for i in range(1, int(page_num) + 1):
        url = base + '_' + str(i) + '.htm'
        print(url)
        result = requests.get(url, headers=HEADERS, timeout=10).text
        body = BeautifulSoup(result, 'lxml').find('div', class_='ImageBody')
        p_tag = body.find('p') if body is not None else None
        if p_tag is None or p_tag.img is None:
            continue  # page has no image: skip it
        img_url = p_tag.img['src']
        file_name = os.path.join(SAVE_DIR, page_index + '_' + str(i) + '.jpg')
        print(file_name)
        urllib.urlretrieve(img_url, file_name)