Xuliqin Practice1106

%pip install pandas matplotlib numpy pathlib pingouin lets_plot

Requirement already satisfied: pandas in d:\program files\python3.12.7\lib\site-packages (2.2.3)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: matplotlib in d:\program files\python3.12.7\lib\site-packages (3.9.2)
Requirement already satisfied: numpy in d:\program files\python3.12.7\lib\site-packages (1.26.4)
Requirement already satisfied: pathlib in d:\program files\python3.12.7\lib\site-packages (1.0.1)
Requirement already satisfied: pingouin in d:\program files\python3.12.7\lib\site-packages (0.5.5)
Requirement already satisfied: lets_plot in d:\program files\python3.12.7\lib\site-packages (4.5.1)
Requirement already satisfied: python-dateutil>=2.8.2 in d:\program files\python3.12.7\lib\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in d:\program files\python3.12.7\lib\site-packages (from pandas) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in d:\program files\python3.12.7\lib\site-packages (from pandas) (2024.2)
Requirement already satisfied: contourpy>=1.0.1 in d:\program files\python3.12.7\lib\site-packages (from matplotlib) (1.3.0)
Requirement already satisfied: cycler>=0.10 in d:\program files\python3.12.7\lib\site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in d:\program files\python3.12.7\lib\site-packages (from matplotlib) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in d:\program files\python3.12.7\lib\site-packages (from matplotlib) (1.4.7)
Requirement already satisfied: packaging>=20.0 in d:\program files\python3.12.7\lib\site-packages (from matplotlib) (24.1)
Requirement already satisfied: pillow>=8 in d:\program files\python3.12.7\lib\site-packages (from matplotlib) (11.0.0)
Requirement already satisfied: pyparsing>=2.3.1 in d:\program files\python3.12.7\lib\site-packages (from matplotlib) (3.2.0)
Requirement already satisfied: pandas-flavor in d:\program files\python3.12.7\lib\site-packages (from pingouin) (0.6.0)
Requirement already satisfied: scikit-learn>=1.2 in d:\program files\python3.12.7\lib\site-packages (from pingouin) (1.6.0)
Requirement already satisfied: scipy in d:\program files\python3.12.7\lib\site-packages (from pingouin) (1.14.1)
Requirement already satisfied: seaborn in d:\program files\python3.12.7\lib\site-packages (from pingouin) (0.13.2)
Requirement already satisfied: statsmodels in d:\program files\python3.12.7\lib\site-packages (from pingouin) (0.14.4)
Requirement already satisfied: tabulate in d:\program files\python3.12.7\lib\site-packages (from pingouin) (0.9.0)
Requirement already satisfied: pypng in d:\program files\python3.12.7\lib\site-packages (from lets_plot) (0.20220715.0)
Requirement already satisfied: palettable in d:\program files\python3.12.7\lib\site-packages (from lets_plot) (3.3.3)
Requirement already satisfied: six>=1.5 in d:\program files\python3.12.7\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: joblib>=1.2.0 in d:\program files\python3.12.7\lib\site-packages (from scikit-learn>=1.2->pingouin) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in d:\program files\python3.12.7\lib\site-packages (from scikit-learn>=1.2->pingouin) (3.5.0)
Requirement already satisfied: xarray in d:\program files\python3.12.7\lib\site-packages (from pandas-flavor->pingouin) (2024.11.0)
Requirement already satisfied: patsy>=0.5.6 in d:\program files\python3.12.7\lib\site-packages (from statsmodels->pingouin) (1.0.1)


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: pythonw.exe -m pip install --upgrade pip

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pingouin as pg
# !pip install lets-plot
from lets_plot import *
LetsPlot.setup_html(no_js=True)
plt.style.use("https://raw.githubusercontent.com/aeturrell/core_python/main/plot_style.txt")

df = pd.read_csv(
    "https://data.giss.nasa.gov/gistemp/tabledata_v4/NH.Ts+dSST.csv",
    skiprows=1,
    na_values="***",
)
df.head()

	Year	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec	J-D	D-N	DJF	MAM	JJA	SON
0	1880	-0.39	-0.53	-0.23	-0.30	-0.05	-0.18	-0.22	-0.25	-0.24	-0.30	-0.43	-0.42	-0.30	NaN	NaN	-0.20	-0.22	-0.32
1	1881	-0.31	-0.25	-0.06	-0.02	0.05	-0.34	0.09	-0.06	-0.28	-0.44	-0.37	-0.24	-0.19	-0.20	-0.33	-0.01	-0.10	-0.37
2	1882	0.25	0.21	0.02	-0.30	-0.23	-0.29	-0.28	-0.15	-0.25	-0.52	-0.33	-0.68	-0.21	-0.17	0.08	-0.17	-0.24	-0.37
3	1883	-0.57	-0.66	-0.15	-0.30	-0.26	-0.12	-0.06	-0.23	-0.34	-0.17	-0.44	-0.15	-0.29	-0.33	-0.64	-0.23	-0.14	-0.32
4	1884	-0.16	-0.11	-0.64	-0.59	-0.36	-0.41	-0.41	-0.52	-0.45	-0.44	-0.58	-0.47	-0.43	-0.40	-0.14	-0.53	-0.45	-0.49

df.info()
na_values="***"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    145 non-null    int64  
 1   Jan     145 non-null    float64
 2   Feb     145 non-null    float64
 3   Mar     145 non-null    float64
 4   Apr     145 non-null    float64
 5   May     145 non-null    float64
 6   Jun     145 non-null    float64
 7   Jul     145 non-null    float64
 8   Aug     145 non-null    float64
 9   Sep     145 non-null    float64
 10  Oct     145 non-null    float64
 11  Nov     145 non-null    float64
 12  Dec     144 non-null    float64
 13  J-D     144 non-null    float64
 14  D-N     144 non-null    float64
 15  DJF     144 non-null    float64
 16  MAM     145 non-null    float64
 17  JJA     145 non-null    float64
 18  SON     145 non-null    float64
dtypes: float64(18), int64(1)
memory usage: 21.7 KB

df = df.set_index("Year")
df.head()

	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec	J-D	D-N	DJF	MAM	JJA	SON
Year
1880	-0.39	-0.53	-0.23	-0.30	-0.05	-0.18	-0.22	-0.25	-0.24	-0.30	-0.43	-0.42	-0.30	NaN	NaN	-0.20	-0.22	-0.32
1881	-0.31	-0.25	-0.06	-0.02	0.05	-0.34	0.09	-0.06	-0.28	-0.44	-0.37	-0.24	-0.19	-0.20	-0.33	-0.01	-0.10	-0.37
1882	0.25	0.21	0.02	-0.30	-0.23	-0.29	-0.28	-0.15	-0.25	-0.52	-0.33	-0.68	-0.21	-0.17	0.08	-0.17	-0.24	-0.37
1883	-0.57	-0.66	-0.15	-0.30	-0.26	-0.12	-0.06	-0.23	-0.34	-0.17	-0.44	-0.15	-0.29	-0.33	-0.64	-0.23	-0.14	-0.32
1884	-0.16	-0.11	-0.64	-0.59	-0.36	-0.41	-0.41	-0.52	-0.45	-0.44	-0.58	-0.47	-0.43	-0.40	-0.14	-0.53	-0.45	-0.49

df.tail()

	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec	J-D	D-N	DJF	MAM	JJA	SON
Year
2020	1.59	1.69	1.66	1.39	1.27	1.14	1.10	1.12	1.19	1.21	1.58	1.18	1.34	1.36	1.56	1.44	1.12	1.33
2021	1.25	0.96	1.20	1.13	1.05	1.21	1.07	1.02	1.05	1.29	1.29	1.17	1.14	1.14	1.13	1.13	1.10	1.21
2022	1.24	1.16	1.41	1.09	1.02	1.13	1.06	1.17	1.15	1.31	1.09	1.06	1.16	1.17	1.19	1.17	1.12	1.19
2023	1.29	1.29	1.64	1.01	1.13	1.19	1.44	1.57	1.67	1.88	1.97	1.85	1.50	1.43	1.22	1.26	1.40	1.84
2024	1.67	1.93	1.77	1.79	1.44	1.54	1.42	1.42	1.58	1.72	1.90	NaN	NaN	1.67	1.82	1.67	1.46	1.73

fig, ax = plt.subplots()
df["Jan"].plot(ax=ax)
ax.set_ylabel("y label")
ax.set_xlabel("x label")
ax.set_title("title")
plt.show()

fig, ax = plt.subplots()
ax.plot(df.index, df["Jan"])
ax.set_ylabel("y label")
ax.set_xlabel("x label")
ax.set_title("title")
plt.show()

month = "Jan"
fig, ax = plt.subplots()
ax.axhline(0, color="orange")
ax.annotate("1951—1980 average", xy=(0.66, -0.2), xycoords=("figure fraction", "data"))
df[month].plot(ax=ax)
ax.set_title(
    f"Average temperature anomaly in {month} \n in the northern hemisphere (1880—{df.index.max()})"
)
ax.set_ylabel("Annual temperature anomalies")

Text(0, 0.5, 'Annual temperature anomalies')

month = "J-D"
fig, ax = plt.subplots()
ax.axhline(0, color="orange")
ax.annotate("1951—1980 average", xy=(0.68, -0.2), xycoords=("figure fraction", "data"))
df[month].plot(ax=ax)
ax.set_title(
    f"Average annual temperature anomaly in \n in the northern hemisphere (1880—{df.index.max()})"
)
ax.set_ylabel("Annual temperature anomalies")

Text(0, 0.5, 'Annual temperature anomalies')

df["Period"] = pd.cut(
    df.index,
    bins=[1921, 1950, 1980, 2010],
    labels=["1921—1950", "1951—1980", "1981—2010"],
    ordered=True,
)
df["Period"].tail(20)

Year
2005    1981—2010
2006    1981—2010
2007    1981—2010
2008    1981—2010
2009    1981—2010
2010    1981—2010
2011          NaN
2012          NaN
2013          NaN
2014          NaN
2015          NaN
2016          NaN
2017          NaN
2018          NaN
2019          NaN
2020          NaN
2021          NaN
2022          NaN
2023          NaN
2024          NaN
Name: Period, dtype: category
Categories (3, object): ['1921—1950' < '1951—1980' < '1981—2010']

list_of_months = ["Jun", "Jul", "Aug"]
df[list_of_months].stack().head()

Year     
1880  Jun   -0.18
      Jul   -0.22
      Aug   -0.25
1881  Jun   -0.34
      Jul    0.09
dtype: float64

fig, axes = plt.subplots(ncols=3, figsize=(9, 4), sharex=True, sharey=True)
for ax, period in zip(axes, df["Period"].dropna().unique()):
    df.loc[df["Period"] == period, list_of_months].stack().hist(ax=ax)
    ax.set_title(period)
plt.suptitle("Histogram of temperature anomalies")
axes[1].set_xlabel("Summer temperature distribution")
plt.tight_layout()

# Create a variable that has years 1951 to 1980, and months Jan to Dec (inclusive)
temp_all_months = df.loc[(df.index >= 1951) & (df.index <= 1980), "Jan":"Dec"]
# Put all the data in stacked format and give the new columns sensible names
temp_all_months = (
    temp_all_months.stack()
    .reset_index()
    .rename(columns={"level_1": "month", 0: "values"})
)
# Take a look at this data:
temp_all_months

	Year	month	values
0	1951	Jan	-0.36
1	1951	Feb	-0.51
2	1951	Mar	-0.18
3	1951	Apr	0.06
4	1951	May	0.17
...	...	...	...
355	1980	Aug	0.10
356	1980	Sep	0.10
357	1980	Oct	0.12
358	1980	Nov	0.21
359	1980	Dec	0.09

360 rows × 3 columns

quantiles = [0.3, 0.7]
list_of_percentiles = np.quantile(temp_all_months["values"], q=quantiles)

print(f"The cold threshold of {quantiles[0]*100}% is {list_of_percentiles[0]}")
print(f"The hot threshold of {quantiles[1]*100}% is {list_of_percentiles[1]}")

The cold threshold of 30.0% is -0.1
The hot threshold of 70.0% is 0.1

# Create a variable that has years 1981 to 2010, and months Jan to Dec (inclusive)
temp_all_months = df.loc[(df.index >= 1981) & (df.index <= 2010), "Jan":"Dec"]
# Put all the data in stacked format and give the new columns sensible names
temp_all_months = (
    temp_all_months.stack()
    .reset_index()
    .rename(columns={"level_1": "month", 0: "values"})
)
# Take a look at the start of this data data:
temp_all_months.head()

	Year	month	values
0	1981	Jan	0.80
1	1981	Feb	0.62
2	1981	Mar	0.68
3	1981	Apr	0.39
4	1981	May	0.18

entries_less_than_q30 = temp_all_months["values"] < list_of_percentiles[0]
proportion_under_q30 = entries_less_than_q30.mean()
print(
    f"The proportion under {list_of_percentiles[0]} is {proportion_under_q30*100:.2f}%"
)

The proportion under -0.1 is 1.94%

proportion_over_q70 = (temp_all_months["values"] > list_of_percentiles[1]).mean()
print(f"The proportion over {list_of_percentiles[1]} is {proportion_over_q70*100:.2f}%")

The proportion over 0.1 is 84.72%

temp_all_months = (
    df.loc[:, "DJF":"SON"]
    .stack()
    .reset_index()
    .rename(columns={"level_1": "Season", 0: "Values"})
)
temp_all_months["Period"] = pd.cut(
    temp_all_months["Year"],
    bins=[1921, 1950, 1980, 2010],
    labels=["1921—1950", "1951—1980", "1981—2010"],
    ordered=True,
)
# Take a look at a cut of the data using `.iloc`, which provides position
temp_all_months.iloc[-135:-125]

	Year	Season	Values	Period
444	1991	MAM	0.45	1981—2010
445	1991	JJA	0.42	1981—2010
446	1991	SON	0.32	1981—2010
447	1992	DJF	0.43	1981—2010
448	1992	MAM	0.29	1981—2010
449	1992	JJA	-0.04	1981—2010
450	1992	SON	-0.15	1981—2010
451	1993	DJF	0.37	1981—2010
452	1993	MAM	0.31	1981—2010
453	1993	JJA	0.12	1981—2010

grp_mean_var = temp_all_months.groupby(["Season", "Period"], observed=False)["Values"].agg(
    ["mean", "var"]
)
grp_mean_var

		mean	var
Season	Period
DJF	1921—1950	-0.025862	0.057489
	1951—1980	-0.002000	0.050548
	1981—2010	0.523333	0.078975
JJA	1921—1950	-0.053448	0.021423
	1951—1980	0.000000	0.014697
	1981—2010	0.400000	0.067524
MAM	1921—1950	-0.041034	0.031302
	1951—1980	0.000333	0.025245
	1981—2010	0.509333	0.075737
SON	1921—1950	0.083448	0.027473
	1951—1980	-0.001333	0.026205
	1981—2010	0.429000	0.111127

min_year = 1880
(
    ggplot(temp_all_months, aes(x="Year", y="Values", color="Season"))
    + geom_abline(slope=0, color="black", size=1)
    + geom_line(size=1)
    + labs(
        title=f"Average annual temperature anomaly in \n in the northern hemisphere ({min_year}—{temp_all_months['Year'].max()})",
        y="Annual temperature anomalies",
    )
    + scale_x_continuous(format="d")
    + geom_text(
        x=min_year, y=0.1, label="1951—1980 average", hjust="left", color="black"
    )
)

df_co2 = pd.read_csv("./file/data2.csv")
df_co2.head()

	Year	Month	Monthly average	Interpolated	Trend
0	1958	3	315.71	315.71	314.62
1	1958	4	317.45	317.45	315.29
2	1958	5	317.50	317.50	314.71
3	1958	6	-99.99	317.10	314.85
4	1958	7	315.86	315.86	314.98

df_co2_june = df_co2.loc[df_co2["Month"] == 6]
df_co2_june.head()

	Year	Month	Monthly average	Interpolated	Trend
3	1958	6	-99.99	317.10	314.85
15	1959	6	318.15	318.15	315.92
27	1960	6	319.59	319.59	317.36
39	1961	6	319.77	319.77	317.48
51	1962	6	320.55	320.55	318.27

df_temp_co2 = pd.merge(df_co2_june, df, on="Year")
df_temp_co2[["Year", "Jun", "Trend"]].head()

	Year	Jun	Trend
0	1958	0.05	314.85
1	1959	0.14	315.92
2	1960	0.18	317.36
3	1961	0.18	317.48
4	1962	-0.13	318.27

(
    ggplot(df_temp_co2, aes(x="Jun", y="Trend"))
    + geom_point(color="black", size=3)
    + labs(
        title="Scatterplot of temperature anomalies vs carbon dioxide emissions",
        y="Carbon dioxide levels (trend, mole fraction)",
        x="Temperature anomaly (degrees Celsius)",
    )
)

df_temp_co2[["Jun", "Trend"]].corr(method="pearson")

	Jun	Trend
Jun	1.000000	0.915419
Trend	0.915419	1.000000

(
    ggplot(df_temp_co2, aes(x="Year", y="Jun"))
    + geom_line(size=1)
    + labs(
        title="June temperature anomalies",
    )
    + scale_x_continuous(format="d")
)

base_plot = ggplot(df_temp_co2) + scale_x_continuous(format="d")
plot_p = (
    base_plot
    + geom_line(aes(x="Year", y="Jun"), size=1)
    + labs(title="June temperature anomalies")
)
plot_q = (
    base_plot
    + geom_line(aes(x="Year", y="Trend"), size=1)
    + labs(title="Carbon dioxide emissions")
)
gggrid([plot_p, plot_q], ncol=2)