Xuliqin Practice1127-1

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
import textwrap

pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/storms.csv", nrows=10
)

	rownames	name	year	month	day	hour	lat	long	status	category	wind	pressure	tropicalstorm_force_diameter	hurricane_force_diameter
0	1	Amy	1975	6	27	0	27.5	-79.0	tropical depression	NaN	25	1013	NaN	NaN
1	2	Amy	1975	6	27	6	28.5	-79.0	tropical depression	NaN	25	1013	NaN	NaN
2	3	Amy	1975	6	27	12	29.5	-79.0	tropical depression	NaN	25	1013	NaN	NaN
3	4	Amy	1975	6	27	18	30.5	-79.0	tropical depression	NaN	25	1013	NaN	NaN
4	5	Amy	1975	6	28	0	31.5	-78.8	tropical depression	NaN	25	1012	NaN	NaN
5	6	Amy	1975	6	28	6	32.4	-78.7	tropical depression	NaN	25	1012	NaN	NaN
6	7	Amy	1975	6	28	12	33.3	-78.0	tropical depression	NaN	25	1011	NaN	NaN
7	8	Amy	1975	6	28	18	34.0	-77.0	tropical depression	NaN	30	1006	NaN	NaN
8	9	Amy	1975	6	29	0	34.4	-75.8	tropical storm	NaN	35	1004	NaN	NaN
9	10	Amy	1975	6	29	6	34.0	-74.8	tropical storm	NaN	40	1002	NaN	NaN

url = "http://aeturrell.com/research"
page = requests.get(url)
page.text[:300]

'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>\n\n<meta charset="utf-8">\n<meta name="generator" content="quarto-1.6.39">\n\n<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n\n<meta name="author" content="Arthur Turrell">\n'

soup = BeautifulSoup(page.text, "html.parser")
print(soup.prettify()[60000:60500])

TJDdmFjYW5jaWVzJTJDQ09WSUQtMTk=" data-index="1" data-listing-date-modified-sort="NaN" data-listing-date-sort="1651359600000" data-listing-file-modified-sort="1687564711698" data-listing-reading-time-sort="1" data-listing-word-count-sort="182">
         <div class="project-content listing-pub-info">
          <p>
           Draca, Mirko, Emma Duchini, Roland Rathelot, Arthur Turrell, and Giulia Vattuone. Revolution in Progress? The Rise of Remote Work in the UK.
           <i>
            Univers

# Get all paragraphs
all_paras = soup.find_all("p")
# Just show one of the paras
all_paras[1]

<p>Blundell, Jack, Emma Duchini, Stefania Simion, and Arthur Turrell. "Pay transparency and gender equality." <i>American Economic Journal: Economic Policy</i> (2024). doi: <a href="https://www.aeaweb.org/articles?id=10.1257/pol.20220766&amp;from=f"><code>10.1257/pol.20220766</code></a></p>

all_paras[1].text

'Blundell, Jack, Emma Duchini, Stefania Simion, and Arthur Turrell. "Pay transparency and gender equality." American Economic Journal: Economic Policy (2024). doi: 10.1257/pol.20220766'

projects = soup.find_all("div", class_="project-content listing-pub-info")
projects = [x.text.strip() for x in projects]
projects[:4]

['Blundell, Jack, Emma Duchini, Stefania Simion, and Arthur Turrell. "Pay transparency and gender equality." American Economic Journal: Economic Policy (2024). doi: 10.1257/pol.20220766',
 'Botta, Federico, Robin Lovelace, Laura Gilbert, and Arthur Turrell. "Packaging code and data for reproducible research: A case study of journey time statistics." Environment and Planning B: Urban Analytics and City Science (2024): 23998083241267331. doi: 10.1177/23998083241267331',
 'Kalamara, Eleni, Arthur Turrell, Chris Redl, George Kapetanios, and Sujit Kapadia. "Making text count: economic forecasting using newspaper text." Journal of Applied Econometrics 37, no. 5 (2022): 896-919. doi: 10.1002/jae.2907',
 'Turrell, A., Speigner, B., Copple, D., Djumalieva, J. and Thurgood, J., 2021. Is the UK’s productivity puzzle mostly driven by occupational mismatch? An analysis using big data on job vacancies. Labour Economics, 71, p.102013. doi: 10.1016/j.labeco.2021.102013']

df_list = pd.read_html(
    "https://simple.wikipedia.org/wiki/FIFA_World_Cup", match="Sweden"
)
# Retrieve first and only entry from list of dataframes
df = df_list[0]
df.head()

	Years	Hosts	Winners	Score	Runner's-up	Third place	Score.1	Fourth place
0	1930 Details	Uruguay	Uruguay	4 - 2	Argentina	United States	[note 1]	Yugoslavia
1	1934 Details	Italy	Italy	2 - 1	Czechoslovakia	Germany	3 - 2	Austria
2	1938 Details	France	Italy	4 - 2	Hungary	Brazil	4 - 2	Sweden
3	1950 Details	Brazil	Uruguay	2 - 1	Brazil	Sweden	[note 2]	Spain
4	1954 Details	Switzerland	West Germany	3 - 2	Hungary	Austria	3 - 1	Uruguay