Example questions
Questions follow the course’s learning goals (e.g., “learn how to scrape”) and six cognitive skill levels (e.g., “application”) (for details, see here). Below, you can find a few example questions, which will be discussed with students in the final live stream of this course.
The exam consists of open (answer boxes, file uploads) and closed (multiple-choice) questions. You can go freely back and forth between questions.
Note: the number of questions depends on the points awarded to each question. The instructions in the final exam may vary slightly from these examples, so make sure to read them carefully.
According to “Fields of Gold”, what are useful dimensions to distinguish websites when considering them for a scraping project? (knowledge)
What is the purpose of BeautifulSoup in the context of web scraping? (comprehension)
Please describe the difference between “parsing on the fly” versus “parsing after the data collection” (according to “Fields of Gold”). (comprehension)
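For reference (not part of the exam question), a minimal sketch of what BeautifulSoup does: it parses raw HTML into a searchable tree so that elements can be selected by tag or class instead of with string operations. The HTML string below is made up purely for illustration.

from bs4 import BeautifulSoup

# hypothetical HTML snippet, just for illustration
html = '<div class="product"><h3>Example book</h3><p class="price_color">£12.99</p></div>'

# parse the raw HTML into a navigable, searchable tree
soup = BeautifulSoup(html, "html.parser")

# extract elements by tag or class
print(soup.find('h3').get_text())                       # Example book
print(soup.find('p', class_='price_color').get_text())  # £12.99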
What conclusions can you draw with regard to the accuracy of the timestamps provided in the user review section of Metacritic (e.g., https://www.metacritic.com/movie/the-godfather/user-reviews)? Provide a short answer in less than 50 words. (analysis)
Please take a look at the code snippet below, which retrieves data on the title and number of comments for posts on the ‘marketing’ subreddit. Please modify the code such that this data is extracted for the ‘digitalmarketing’ and ‘socialmedia’ subreddits, in addition to the ‘marketing’ subreddit. The output should be a list with dictionaries including the subreddit name, title and number of comments for each post.
import requests

headers = {'authority': 'www.reddit.com',
           'cache-control': 'max-age=10',
           'upgrade-insecure-requests': '1',
           'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
           'sec-fetch-site': 'same-origin',
           'sec-fetch-mode': 'navigate',
           'sec-fetch-user': '?1',
           'sec-fetch-dest': 'document',
           'accept-language': 'en-GB,en;q=0.9'}

def get_posts(subreddit):
    url = f'https://www.reddit.com/r/{subreddit}.json'
    response = requests.get(url, headers=headers)
    json_response = response.json()
    posts = []
    for item in json_response['data']['children']:
        posts.append({'title': item['data']['title'],
                      'number of comments': item['data']['num_comments']})
    return posts

posts = get_posts('marketing')
posts
Solution
import requests

headers = {'authority': 'www.reddit.com',
           'cache-control': 'max-age=10',
           'upgrade-insecure-requests': '1',
           'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
           'sec-fetch-site': 'same-origin',
           'sec-fetch-mode': 'navigate',
           'sec-fetch-user': '?1',
           'sec-fetch-dest': 'document',
           'accept-language': 'en-GB,en;q=0.9'}

def get_posts(subreddit):
    url = f'https://www.reddit.com/r/{subreddit}.json'
    response = requests.get(url, headers=headers)
    json_response = response.json()
    posts = []
    for item in json_response['data']['children']:
        posts.append({'subreddit name': item['data']['subreddit'],
                      'title': item['data']['title'],
                      'number of comments': item['data']['num_comments']})
    return posts

subreddits = ['marketing', 'digitalmarketing', 'socialmedia']

all_posts = []  # create empty list to hold the final results

# loop through the subreddits
for sub in subreddits:
    # use the `get_posts()` function to retrieve the posts for subreddit `sub`
    retrieved_posts = get_posts(sub)
    # loop through the retrieved posts and add them to the list holding all posts
    for post in retrieved_posts:
        all_posts.append(post)

all_posts
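As a side note (not part of the expected answer), the aggregation step above can also be written more compactly with a list comprehension; this sketch assumes the get_posts() function defined in the solution is already available.

# equivalent, more compact aggregation using a list comprehension
subreddits = ['marketing', 'digitalmarketing', 'socialmedia']
all_posts = [post for sub in subreddits for post in get_posts(sub)]
len(all_posts)  # quick sanity check on the number of collected posts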
Please evaluate the feasibility of this data extraction plan:
Example data extraction plan
Please collect the product names and prices for all books listed in this section: https://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html. Please not only list these variables, but also provide timestamps from the moment you started your data collection. Submit your (a) Python code (as .py or .ipynb), along with (b) the collected data (.json). Please start from the code snippet below.
Code to start from
# import packages
import requests
from bs4 import BeautifulSoup
import time
import json

# set url
page_url = 'https://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html'

res = requests.get(page_url)
res.encoding = res.apparent_encoding
soup = BeautifulSoup(res.text, "html.parser")

books = soup.find_all(class_="product_pod")

for book in books:
    name = book.find('h3').get_text()
    print(name)
Solution
# import packages
import requests
from bs4 import BeautifulSoup
import time
import json

# set url
page_url = 'https://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html'

def get_books(page_url):
    res = requests.get(page_url)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, "html.parser")
    books = soup.find_all(class_="product_pod")
    time_start = int(time.time())
    book_list = []
    for book in books:
        name = book.find('h3').get_text()
        price = book.find('p', class_='price_color').get_text()
        return_dic = {'name': name, 'price': price, 'time_start': time_start}
        book_list.append(return_dic)
    return book_list

# write a function that checks whether there is a next page
def check_next_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    next_btn = soup.find(class_="next")
    return next_btn.find("a").attrs["href"] if next_btn else None

all_books = []
while page_url:
    print(page_url)
    for book in get_books(page_url):
        all_books.append(book)
    if check_next_page(page_url) != None:
        page_url = "https://books.toscrape.com/catalogue/category/books/nonfiction_13/" + check_next_page(page_url)
    else:
        break

# write to file
f = open('all_books.json', 'w')
for book in all_books:
    f.write(json.dumps(book) + '\n')
f.close()
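As an optional sanity check (not part of the required answer), the JSON-lines file written above can be read back in and inspected; the file name all_books.json matches the solution.

import json

# read the JSON-lines file back in and inspect the result
with open('all_books.json', 'r') as f:
    loaded_books = [json.loads(line) for line in f]

print(len(loaded_books))  # number of scraped books
print(loaded_books[0])    # first record: name, price, and timestamp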
Scrape the top 1000 lifetime grossing movies (domestic) from Box Office Mojo. Export the rank, title, lifetime gross and release year of these movies to a CSV file.
Code to start from
# import packages
import requests
from bs4 import BeautifulSoup

url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW&offset=0'

res = requests.get(url)
res.encoding = res.apparent_encoding
soup = BeautifulSoup(res.text, "html.parser")

movies = soup.find(class_='imdb-scroll-table')
Solution
# import packages
import requests
from bs4 import BeautifulSoup
import csv

top_1000 = []

# the chart is spread across 5 pages of 200 movies each
for i in range(5):
    page_url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=' + str(i*200)
    res = requests.get(page_url)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, "html.parser")

    movies = soup.find('table', class_='mojo-body-table')

    # loop through the table rows, skipping the header row
    for j, movie in enumerate(movies.find_all('tr')):
        if j == 0:
            continue
        ranking = movie.find('td', class_='a-text-right').get_text()
        title = movie.find('a', class_='a-link-normal').get_text()
        lifetime_gross = movie.find('td', class_='mojo-field-type-money').get_text()
        release_year = movie.find('td', class_='mojo-field-type-year').get_text()
        top_1000.append({'ranking': ranking,
                         'title': title,
                         'lifetime_gross': lifetime_gross,
                         'release_year': release_year})

# export the results to a CSV file
with open('top_1000movies.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['ranking', 'title', 'lifetime_gross', 'release_year'])
    writer.writeheader()
    writer.writerows(top_1000)
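As an optional sanity check (not part of the required answer), the exported CSV can be read back with csv.DictReader to confirm that 1,000 rows were written; the file name top_1000movies.csv matches the solution above.

import csv

# read the exported CSV back in and verify the number of rows
with open('top_1000movies.csv', newline='', encoding='utf-8') as file:
    rows = list(csv.DictReader(file))

print(len(rows))  # expected: 1000
print(rows[0])    # first record: ranking, title, lifetime_gross, release_year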