import requests
import pymysql
import re
import time
from lxml import etree
# Open the MySQL connection shared by the whole script.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root123',
    db='test',
)
# Cursor through which the scraped rows are inserted.
cursor = conn.cursor()
# Browser-like User-Agent header passed to the HTTP requests below.
req_head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
def get_movie_url(url):
    """Fetch one listing page and scrape every movie linked from it.

    Downloads ``url``, collects the ``href`` of each link found under a
    ``<div class="hd">`` element and passes each one to ``get_movie_info``.

    Args:
        url: Address of a listing page.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=req_head, timeout=10)
    selector = etree.HTML(response.text)
    for movie_href in selector.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(movie_href)
def get_movie_info(url):
    """Scrape one movie detail page and insert it into ``doubanmovie``.

    Downloads ``url``, extracts the title, first director, cast text, first
    genre, country, release date, runtime and score, and executes an INSERT
    on the module-level ``cursor``. Nothing is committed here; that is left
    to the caller.

    A page that lacks any of the expected elements is skipped and reported
    on stdout instead of being stored.

    Args:
        url: Address of the movie detail page.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=req_head, timeout=10)
    page = response.text
    selector = etree.HTML(page)
    try:
        # str() turns lxml's string-result objects into plain strings
        # before they are handed to the database driver.
        movie_name = str(selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0])
        director = str(selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0])
        actors = selector.xpath('//*[@id="info"]/span[3]/span[2]')[0]
        # string(.) concatenates all text nested inside the cast element.
        actor = str(actors.xpath('string(.)'))
        style = re.findall(r'<span property="v:genre">(.*?)</span>', page, re.S)[0]
        # Accept both "<br/>" and "<br />" and drop the whitespace captured
        # between the label and the line break.
        country = re.findall(
            r'<span class="pl">制片国家/地区:</span>(.*?)<br\s*/?>', page, re.S
        )[0].strip()
        # Match any "content" attribute value rather than one specific
        # film's date and runtime, so the patterns work for every page.
        release_time = re.findall(
            r'<span property="v:initialReleaseDate" content="[^"]*">(.*?)</span>',
            page,
            re.S,
        )[0]
        movie_time = re.findall(
            r'<span property="v:runtime" content="[^"]*">(.*?)</span>', page, re.S
        )[0]
        score = str(selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0])
    except IndexError:
        # An expected element was not found on the page; report it rather
        # than dropping the movie silently.
        print("skipped {}: could not extract all movie fields".format(url))
        return
    # Placeholders let the driver quote and escape the scraped values, which
    # yields valid SQL for text and prevents injection through page content.
    insert_sql = (
        "insert into doubanmovie "
        "(movie_name,director,actor,style,country,release_time,movie_time,score) "
        "values (%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    cursor.execute(
        insert_sql,
        (movie_name, director, actor, style, country, release_time, movie_time, score),
    )
def main():
    """Crawl the ten Top 250 listing pages, then commit and close.

    Builds the listing URLs for offsets 0, 25, ..., 225, scrapes each one via
    ``get_movie_url`` and commits all inserts in a single transaction once
    every page has been processed. The connection is closed whether or not
    the crawl succeeds; on failure nothing is committed and the exception
    propagates.
    """
    page_urls = [
        'https://movie.douban.com/top250?start={}&filter='.format(offset)
        for offset in range(0, 250, 25)
    ]
    try:
        for page_url in page_urls:
            get_movie_url(page_url)
            # Wait two seconds before requesting the next listing page.
            time.sleep(2)
        conn.commit()
    finally:
        # Release the connection even if a request or insert raised.
        conn.close()
    print("程序执行完毕")
# Start the crawl. There is no ``__main__`` guard, so this also runs on import.
main()
import pymysql
import re
import time
from lxml import etree
# Open the MySQL connection shared by the whole script.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root123',
    db='test',
)
# Cursor through which the scraped rows are inserted.
cursor = conn.cursor()
# Browser-like User-Agent header passed to the HTTP requests below.
req_head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
def get_movie_url(url):
    """Fetch one listing page and scrape every movie linked from it.

    Downloads ``url``, collects the ``href`` of each link found under a
    ``<div class="hd">`` element and passes each one to ``get_movie_info``.

    Args:
        url: Address of a listing page.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=req_head, timeout=10)
    selector = etree.HTML(response.text)
    for movie_href in selector.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(movie_href)
def get_movie_info(url):
    """Scrape one movie detail page and insert it into ``doubanmovie``.

    Downloads ``url``, extracts the title, first director, cast text, first
    genre, country, release date, runtime and score, and executes an INSERT
    on the module-level ``cursor``. Nothing is committed here; that is left
    to the caller.

    A page that lacks any of the expected elements is skipped and reported
    on stdout instead of being stored.

    Args:
        url: Address of the movie detail page.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url, headers=req_head, timeout=10)
    page = response.text
    selector = etree.HTML(page)
    try:
        # str() turns lxml's string-result objects into plain strings
        # before they are handed to the database driver.
        movie_name = str(selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0])
        director = str(selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0])
        actors = selector.xpath('//*[@id="info"]/span[3]/span[2]')[0]
        # string(.) concatenates all text nested inside the cast element.
        actor = str(actors.xpath('string(.)'))
        style = re.findall(r'<span property="v:genre">(.*?)</span>', page, re.S)[0]
        # Accept both "<br/>" and "<br />" and drop the whitespace captured
        # between the label and the line break.
        country = re.findall(
            r'<span class="pl">制片国家/地区:</span>(.*?)<br\s*/?>', page, re.S
        )[0].strip()
        # Match any "content" attribute value rather than one specific
        # film's date and runtime, so the patterns work for every page.
        release_time = re.findall(
            r'<span property="v:initialReleaseDate" content="[^"]*">(.*?)</span>',
            page,
            re.S,
        )[0]
        movie_time = re.findall(
            r'<span property="v:runtime" content="[^"]*">(.*?)</span>', page, re.S
        )[0]
        score = str(selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0])
    except IndexError:
        # An expected element was not found on the page; report it rather
        # than dropping the movie silently.
        print("skipped {}: could not extract all movie fields".format(url))
        return
    # Placeholders let the driver quote and escape the scraped values, which
    # yields valid SQL for text and prevents injection through page content.
    insert_sql = (
        "insert into doubanmovie "
        "(movie_name,director,actor,style,country,release_time,movie_time,score) "
        "values (%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    cursor.execute(
        insert_sql,
        (movie_name, director, actor, style, country, release_time, movie_time, score),
    )
def main():
    """Crawl the ten Top 250 listing pages, then commit and close.

    Builds the listing URLs for offsets 0, 25, ..., 225, scrapes each one via
    ``get_movie_url`` and commits all inserts in a single transaction once
    every page has been processed. The connection is closed whether or not
    the crawl succeeds; on failure nothing is committed and the exception
    propagates.
    """
    page_urls = [
        'https://movie.douban.com/top250?start={}&filter='.format(offset)
        for offset in range(0, 250, 25)
    ]
    try:
        for page_url in page_urls:
            get_movie_url(page_url)
            # Wait two seconds before requesting the next listing page.
            time.sleep(2)
        conn.commit()
    finally:
        # Release the connection even if a request or insert raised.
        conn.close()
    print("程序执行完毕")
# Start the crawl. There is no ``__main__`` guard, so this also runs on import.
main()