from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
# Xuliqin Practice1127-2
# Downloading IMDb's top 250 movies' data
url = 'http://www.imdb.com/chart/top'

# Once its whitespace is collapsed, a title cell reads
# "<place>. <title> (<year>)".
_RANKED_TITLE = re.compile(r'(\d+)\.\s*(.+?)\s*\(([^()]*)\)')


def split_ranked_title(cell_text):
    """Split one chart cell's text into ``(place, title, year)`` strings.

    The place is the whole run of digits before the first dot, so two- and
    three-digit ranks are kept intact. Dots inside the title are preserved,
    and the year is the content of the trailing parenthesised group.

    Args:
        cell_text: raw text of a title cell; any run of whitespace
            (including newlines) is treated as a single space.

    Returns:
        A 3-tuple of strings ``(place, title, year)``.

    Raises:
        ValueError: if the text is not of the form
            "<place>. <title> (<year>)".
    """
    flat = ' '.join(cell_text.split())
    match = _RANKED_TITLE.fullmatch(flat)
    if match is None:
        raise ValueError('unexpected chart cell text: {!r}'.format(flat))
    return match.groups()


def build_movie_records(cell_texts, crew, ratings):
    """Merge the chart's parallel columns into one dict per movie.

    Args:
        cell_texts: text of every title cell, in chart order.
        crew: "star cast" value of every movie, in the same order.
        ratings: rating value of every movie, in the same order.

    Returns:
        A list of dicts with the keys ``place``, ``movie_title``,
        ``rating``, ``year`` and ``star_cast`` (in that order, which is
        also the column order of the saved CSV).

    Raises:
        ValueError: if a cell's text cannot be parsed.
        IndexError: if ``crew`` or ``ratings`` is shorter than
            ``cell_texts``.
    """
    records = []
    for index, text in enumerate(cell_texts):
        place, movie_title, year = split_ranked_title(text)
        records.append({
            "place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
        })
    return records


# Run the download only when executed as a script, so the parsing helpers
# above can be imported without touching the network or the file system.
if __name__ == "__main__":
    # A timeout keeps a stalled connection from hanging the script, and
    # raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    movies = soup.select('td.titleColumn')
    crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
    ratings = [b.attrs.get('data-value')
               for b in soup.select('td.posterColumn span[name=ir]')]

    # Iterating over movies to extract each movie's details
    # ('place', 'title', 'year', rating and star cast).
    movie_records = build_movie_records(
        [cell.get_text() for cell in movies], crew, ratings)

    for movie in movie_records:
        print(movie["place"], "-", movie["movie_title"],
              "(" + movie["year"] + ") -",
              "Starring:", movie["star_cast"], movie["rating"])

    # saving the list as dataframe
    # then converting into .csv file
    df = pd.DataFrame(movie_records)
    df.to_csv('./file/imdb_top_250_movies.csv', index=False)
# Downloading IMDb's top 250 movies' data
url = 'http://www.imdb.com/chart/top'

# Once its whitespace is collapsed, a title cell reads
# "<place>. <title> (<year>)".
_RANKED_TITLE = re.compile(r'(\d+)\.\s*(.+?)\s*\(([^()]*)\)')


def split_ranked_title(cell_text):
    """Split one chart cell's text into ``(place, title, year)`` strings.

    The place is the whole run of digits before the first dot, so two- and
    three-digit ranks are kept intact. Dots inside the title are preserved,
    and the year is the content of the trailing parenthesised group.

    Args:
        cell_text: raw text of a title cell; any run of whitespace
            (including newlines) is treated as a single space.

    Returns:
        A 3-tuple of strings ``(place, title, year)``.

    Raises:
        ValueError: if the text is not of the form
            "<place>. <title> (<year>)".
    """
    flat = ' '.join(cell_text.split())
    match = _RANKED_TITLE.fullmatch(flat)
    if match is None:
        raise ValueError('unexpected chart cell text: {!r}'.format(flat))
    return match.groups()


def build_movie_records(cell_texts, crew, ratings):
    """Merge the chart's parallel columns into one dict per movie.

    Args:
        cell_texts: text of every title cell, in chart order.
        crew: "star cast" value of every movie, in the same order.
        ratings: rating value of every movie, in the same order.

    Returns:
        A list of dicts with the keys ``place``, ``movie_title``,
        ``rating``, ``year`` and ``star_cast`` (in that order, which is
        also the column order of the saved CSV).

    Raises:
        ValueError: if a cell's text cannot be parsed.
        IndexError: if ``crew`` or ``ratings`` is shorter than
            ``cell_texts``.
    """
    records = []
    for index, text in enumerate(cell_texts):
        place, movie_title, year = split_ranked_title(text)
        records.append({
            "place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
        })
    return records


# Run the download only when executed as a script, so the parsing helpers
# above can be imported without touching the network or the file system.
if __name__ == "__main__":
    # A timeout keeps a stalled connection from hanging the script, and
    # raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    movies = soup.select('td.titleColumn')
    crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
    ratings = [b.attrs.get('data-value')
               for b in soup.select('td.posterColumn span[name=ir]')]

    # Iterating over movies to extract each movie's details
    # ('place', 'title', 'year', rating and star cast).
    movie_records = build_movie_records(
        [cell.get_text() for cell in movies], crew, ratings)

    # printing movie details with its rating.
    for movie in movie_records:
        print(movie['place'], '-', movie['movie_title'], '('+movie['year'] +
              ') -', 'Starring:', movie['star_cast'], movie['rating'])

    ##.......##
    df = pd.DataFrame(movie_records)
    df.to_csv('./file/imdb_top_250_movies.csv', index=False)