import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
Pandas & Matplotlib
Download this notebook: Pandas_Matplotlib.ipynb.
#Importing Titanic data
= pd.read_csv("_files/titanic.csv")
df_titanic df_titanic
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
Let us plot all the ages of the passengers using pandas .plot() function
#Plotting the "Age" column via .plot()
"Age"].plot() df_titanic[
<Axes: >
Hmmm… seems not really useful…
# Lets try sorting it?
=["Age"]).plot() df_titanic.sort_values(by
<Axes: >
Also not the result we wanted…
Lets try sorting and saving it to a new dataframe? What does it contain?
= df_titanic.sort_values(by=["Age"])
df_titanic_new
df_titanic_new.plot() df_titanic_new
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
803 | 804 | 1 | 3 | Thomas, Master. Assad Alexander | male | 0.42 | 0 | 1 | 2625 | 8.5167 | NaN | C |
755 | 756 | 1 | 2 | Hamalainen, Master. Viljo | male | 0.67 | 1 | 1 | 250649 | 14.5000 | NaN | S |
644 | 645 | 1 | 3 | Baclini, Miss. Eugenie | female | 0.75 | 2 | 1 | 2666 | 19.2583 | NaN | C |
469 | 470 | 1 | 3 | Baclini, Miss. Helene Barbara | female | 0.75 | 2 | 1 | 2666 | 19.2583 | NaN | C |
78 | 79 | 1 | 2 | Caldwell, Master. Alden Gates | male | 0.83 | 0 | 2 | 248738 | 29.0000 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
859 | 860 | 0 | 3 | Razi, Mr. Raihed | male | NaN | 0 | 0 | 2629 | 7.2292 | NaN | C |
863 | 864 | 0 | 3 | Sage, Miss. Dorothy Edith "Dolly" | female | NaN | 8 | 2 | CA. 2343 | 69.5500 | NaN | S |
868 | 869 | 0 | 3 | van Melkebeke, Mr. Philemon | male | NaN | 0 | 0 | 345777 | 9.5000 | NaN | S |
878 | 879 | 0 | 3 | Laleff, Mr. Kristo | male | NaN | 0 | 0 | 349217 | 7.8958 | NaN | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
891 rows × 12 columns
The ages are now sorted as shown by the output, but the plot is still the same mess and full of unwanted data!
How about re-indexing everything?
= df_titanic.sort_values(by=["Age"],ascending=True)
df_titanic_new = df_titanic_new.dropna() #Drop NaNs Try including/excluding the entries with NaN
df_titanic_new = df_titanic_new.reset_index(drop=True) #Resetting the index
df_titanic_new #Quick and dirty plot of the dataframe (nan ones dropped) df_titanic_new.plot()
<Axes: >
Ok, now we are getting there! How can we extract just the “Age” data out of the plot?
#Easy, just plot the now sorted "Age" column!
"Age"].plot() df_titanic_new[
<Axes: >
#You can also change the kind of plot you want to get
"Age"].plot(kind="hist", xlabel="Age", ylabel="Amount of passenger") df_titanic_new[
<Axes: xlabel='Age', ylabel='Amount of passenger'>
# Try to sort the fares (x-axis) over the age (y-axis)!
# Hint: .plot(x=..., y=..., kind="scatter")
#<Code goes here>
= df_titanic.loc[:, ["Age", "Fare"]]
df_titanic_new = df_titanic_new.dropna() # Remove rows with missing values
df_titanic_new = df_titanic_new.sort_values(by=["Fare", "Age"]) # Sort by Fare, then Age
df_titanic_new
="Fare", y="Age", kind="scatter", title="Age vs. Fare")
df_titanic_new.plot(x df_titanic_new
Age | Fare | |
---|---|---|
302 | 19.0 | 0.0000 |
271 | 25.0 | 0.0000 |
179 | 36.0 | 0.0000 |
822 | 38.0 | 0.0000 |
806 | 39.0 | 0.0000 |
... | ... | ... |
341 | 24.0 | 263.0000 |
438 | 64.0 | 263.0000 |
258 | 35.0 | 512.3292 |
737 | 35.0 | 512.3292 |
679 | 36.0 | 512.3292 |
714 rows × 2 columns
Pandas plots are great for simple and quick visualization of data, but not very flexible for using multiple and/or complex plots!
# Look at that for example:
#Changing parameters of colors, legends, etc. for each plot will give you a hard time this way!
"Age", "Fare", "Pclass"]].dropna().plot(subplots=True, layout=(2, 2), figsize=(10, 8)) df_titanic[[
array([[<Axes: >, <Axes: >],
[<Axes: >, <Axes: >]], dtype=object)
There is another way to create highly customizable plots in Python!
#As for example:
import matplotlib.pyplot as plt
# Sample customized plot using Matplotlib
=(10, 6)) # Set figure size
plt.figure(figsize"Fare"], df_titanic_new["Age"], color='b', alpha=0.5)
plt.scatter(df_titanic_new["Age vs. Fare - What a nice plot!")
plt.title("Fare")
plt.xlabel("Age")
plt.ylabel(True)
plt.grid( plt.show()
But we shall not get ahead of ourselfs. Slow and steady…
What are Figures und Axis?
The very basics of every plot are shown above!
- A “Figure” is like a piece of paper. Everything needs to be printed on something. You also can print multiple plots in one figure.
- On the next layer are “Axes”. Each Axes object is an individual plot or “subplot” within the Figure and contain specific data such as scaling informations.
- Other elements of the plot(s) like legends, titles, etc. are part of the Axes object and are either inside the plot area (such as data points and gridlines) or just outside it (like axis labels, titles, and legends).
= plt.subplots() #Creating a figure and axes object using matplots subplot function. Yes, even for just a sigle plot!
fig, ax 20)) #Generate some random numbers
ax.plot(np.random.rand('test title') #Set a title
ax.set_title(#And show the plot!
plt.show()
print(type(fig)) #Print the type of the "fig" variable
print(type(ax)) #Print the type of the "ax" variable
<class 'matplotlib.figure.Figure'>
<class 'matplotlib.axes._axes.Axes'>
# Using the subplot method specifically
= 2
row = 2
column
= plt.subplots(row, column)
fig, axes for a in range(row):
for b in range(column):
= axes[a][b]
ax 20))
ax.plot(np.random.rand(f"Plot: ({a, b})")
ax.set_title(
"Hier könnte Ihr Titel stehen!")
fig.suptitle(# Nice method for making your plots a bit prettier. (Adjusts spacing automatically)
fig.tight_layout()
plt.show()
print(type(fig))
print(type(ax))
<class 'matplotlib.figure.Figure'>
<class 'matplotlib.axes._axes.Axes'>
#Code I used to create the data for the following "EnergyUsage.csv"
"""
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
# Datumbereich erstellen (3 Monate, stündlich)
start_date = datetime.datetime(2024, 1, 1)
end_date = start_date + datetime.timedelta(days=90)
date_range = pd.date_range(start=start_date, end=end_date, freq='H')
# Zufällige Werte für Energieverbrauch und Spannung erzeugen
np.random.seed(42) # Für die Reproduzierbarkeit der Zufallszahlen
energy_consumption = np.random.uniform(low=5, high=50, size=len(date_range)) # Mittelwert: 50, Standardabweichung: 10
voltage = np.random.normal(loc=230, scale=5, size=len(date_range)) # Mittelwert: 230V, Standardabweichung: 5
grid_frequency = np.random.normal(loc=50, scale=0.1, size=len(date_range)) # Netzfrequenz um 50 Hz mit etwas Schwankung
# DataFrame erstellen
data = {
'EnergyConsumption': energy_consumption,
'Voltage': voltage,
'GridFrequency': grid_frequency
}
df = pd.DataFrame(data, index=date_range)
df.index.set_names("DateTime", inplace=True)
# Simulierter Stromausfall durch z.B. ein kaputtes Kabel bei Bauarbeiten
start_outage = datetime.datetime(2024, 2, 1, 12, 0, 0) # Annahme: Ein fiktives Startdatum und eine Uhrzeit für den Stromausfall
duration = pd.Timedelta(hours=23) # Dauer des Stromausfalls
# Markiere den Zeitraum des Stromausfalls im DataFrame
df.loc[start_outage:start_outage+duration] = 0
df.to_csv("EnergyUsage.csv")
df
"""
'\nimport pandas as pd\nimport numpy as np\nimport datetime\nimport matplotlib.pyplot as plt\n\n# Datumbereich erstellen (3 Monate, stündlich)\nstart_date = datetime.datetime(2024, 1, 1)\nend_date = start_date + datetime.timedelta(days=90)\ndate_range = pd.date_range(start=start_date, end=end_date, freq=\'H\')\n\n# Zufällige Werte für Energieverbrauch und Spannung erzeugen\nnp.random.seed(42) # Für die Reproduzierbarkeit der Zufallszahlen\nenergy_consumption = np.random.uniform(low=5, high=50, size=len(date_range)) # Mittelwert: 50, Standardabweichung: 10\nvoltage = np.random.normal(loc=230, scale=5, size=len(date_range)) # Mittelwert: 230V, Standardabweichung: 5\ngrid_frequency = np.random.normal(loc=50, scale=0.1, size=len(date_range)) # Netzfrequenz um 50 Hz mit etwas Schwankung\n\n# DataFrame erstellen\ndata = {\n \'EnergyConsumption\': energy_consumption,\n \'Voltage\': voltage,\n \'GridFrequency\': grid_frequency\n}\n\ndf = pd.DataFrame(data, index=date_range)\ndf.index.set_names("DateTime", inplace=True)\n\n# Simulierter Stromausfall durch z.B. ein kaputtes Kabel bei Bauarbeiten\nstart_outage = datetime.datetime(2024, 2, 1, 12, 0, 0) # Annahme: Ein fiktives Startdatum und eine Uhrzeit für den Stromausfall\nduration = pd.Timedelta(hours=23) # Dauer des Stromausfalls\n\n# Markiere den Zeitraum des Stromausfalls im DataFrame\ndf.loc[start_outage:start_outage+duration] = 0\n\ndf.to_csv("EnergyUsage.csv")\ndf\n'
Either run the code to generate EnergyUsage.cvs yourself, or download it here: EnergyUsage.csv.
#First steps in creating plots using matplot
= pd.read_csv("_files/EnergyUsage.csv", index_col="DateTime")
df print(type(df.index)) #Currently our index is just a "simpe index". Use the argument "parse_dates=True" to skip the next step.
<class 'pandas.core.indexes.base.Index'>
= pd.to_datetime(df.index) #Parse the index from "base" to "datetime"
df.index print(type(df.index))
= plt.subplots(figsize=(12,6))
fig, ax
"EnergyConsumption"].plot(color="red", ax=ax, label='Power Usage')
df["GridFrequency"].plot(color="blue", ax=ax, label='Grid Frequency')
df["Voltage"].plot(color="green", ax=ax, label='Grid Voltage')
df[
'Energy Consumption Jan - Mar')
plt.title('Datetime')
plt.xlabel('Power Usage, Frequency, Voltage')
plt.ylabel(
plt.legend()True)
plt.grid(=45)
plt.xticks(rotation
plt.tight_layout() plt.show()
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
= pd.to_datetime(df.index) #Parse the index from "base" to "datetime"
df.index print(type(df.index))
= plt.subplots(1,3,figsize=(12,6))
fig, [ax1, ax2, ax3]
# Plot the Energy Consumption on ax1 (left subplot)
"EnergyConsumption"].plot(color="red", ax=ax1, label='Power Usage')
df['Energy Consumption')
ax1.set_title('Datetime')
ax1.set_xlabel('Power Usage')
ax1.set_ylabel(
ax1.legend()True)
ax1.grid(
# Plot the Grid Frequency on ax2 (middle subplot)
"GridFrequency"].plot(color="blue", ax=ax2, label='Grid Frequency')
df['Grid Frequency')
ax2.set_title('Datetime')
ax2.set_xlabel('Frequency')
ax2.set_ylabel(
ax2.legend()True)
ax2.grid(
# Plot the Voltage on ax3 (right subplot)
"Voltage"].plot(color="green", ax=ax3, label='Grid Voltage')
df['Grid Voltage')
ax3.set_title('Datetime')
ax3.set_xlabel('Voltage')
ax3.set_ylabel(
ax3.legend()True) ax3.grid(
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Tinkering with some cosmetics
= pd.to_datetime(df.index)
df.index
= plt.subplots(figsize=(14, 8))
fig, ax "EnergyConsumption"].plot(color="red", ax=ax, label='Power Usage')
df["GridFrequency"].plot(color="blue", ax=ax, label='Grid Frequency', linestyle="-.")
df["Voltage"].plot(color="green", ax=ax, label='Grid Voltage')
df[
'Energy Consumption Jan - Mar', fontsize=20, color="Orange")
plt.title('Datetime', fontsize=18)
plt.xlabel('Power Usage, Frequency, Voltage', fontsize=18)
plt.ylabel(=12)
plt.legend(fontsize=90)
plt.xticks(rotation
# Hier werden die Anpassungen für die X-Achse vorgenommen
=20)) # Tägliche Intervalle
ax.xaxis.set_major_locator(mdates.DayLocator(interval'%m-%d')) # Datumsformat
ax.xaxis.set_major_formatter(mdates.DateFormatter(
range(0,260,20))
plt.yticks(True)
plt.grid(
plt.tight_layout() plt.show()
#Change the tick intervals on the bottom
= pd.to_datetime(df.index)
df.index
= plt.subplots(figsize=(14, 8))
fig, ax "EnergyConsumption"].plot(color="red", ax=ax, label='Power Usage')
df["GridFrequency"].plot(color="blue", ax=ax, label='Grid Frequency', linestyle="-.")
df["Voltage"].plot(color="green", ax=ax, label='Grid Voltage')
df[
'Energy Consumption Jan - Mar', fontsize=20, color="Orange")
plt.title('Datetime', fontsize=18)
plt.xlabel('Power Usage, Frequency, Voltage', fontsize=18)
plt.ylabel(=12)
plt.legend(fontsize=90)
plt.xticks(rotation
# Hier werden die Anpassungen für die X-Achse vorgenommen
=28)) # Tägliche Intervalle
ax.xaxis.set_major_locator(mdates.DayLocator(interval'%m-%d')) # Datumsformat
ax.xaxis.set_major_formatter(mdates.DateFormatter(
range(0,260,20))
plt.yticks(True)
plt.grid(
plt.tight_layout() plt.show()
#Creating mutliple Y-Axis and "cleaning" the X-Axis
= pd.to_datetime(df.index)
df.index
= plt.subplots(figsize=(14, 8))
fig, ax1
#First Y-Axis
= 'tab:blue'
color2 "Datum")
ax1.set_xlabel("Power Usage", color=color2)
ax1.set_ylabel("EnergyConsumption"], color=color2, label="Power Usage")
ax1.plot(df.index, df[='y', labelcolor=color2)
ax1.tick_params(axisrange(0,62,2))
ax1.set_yticks(
ax1.grid()'%Y-%m-%d') for date in df.index], rotation=45, ha='right')
ax1.set_xticklabels([date.strftime(
=2)) # Jeder zweite Tag wird auf den x-Ticks dargestellt
ax1.xaxis.set_major_locator(mdates.DayLocator(interval'%Y-%m-%d'))
ax1.xaxis.set_major_formatter(mdates.DateFormatter(
#Second Y-Axis
= ax1.twinx()
ax2 = "green"
color3 "Grid Voltage", color=color3)
ax2.set_ylabel("Voltage"], color=color3, label="Grid Voltage")
ax2.plot(df.index, df[='y', labelcolor=color3)
ax2.tick_params(axisrange(0,260, 10))
ax2.set_yticks(
#Third Y-Axis
= 'tab:red' # Verwendung von tab:red anstelle von red ist eine spezielle Konvention in Matplotlib für die Verwendung von "Tableau Colors".
color1 = ax1.twinx()
ax3 "Grid Frequency", color=color1)
ax3.set_ylabel("GridFrequency"], color=color1, label="Grid Frequency")
ax3.plot(df.index, df[='y', labelcolor=color1)
ax3.tick_params(axisrange(0,52,2))
ax3.set_yticks("right"].set_position(("outward", 60))
ax3.spines[
# Combine the Linelabels
= ax1.get_legend_handles_labels()
lines, labels = ax2.get_legend_handles_labels()
lines2, labels2 = ax3.get_legend_handles_labels()
lines3, labels3 += lines2 + lines3
lines += labels2 + labels3
labels
#Command for the positioning of the legend: loc="POSITION"
#Usable commands are: Best, upper/lower left/right, right, center left/right, lower/upper center and center
="best")
plt.legend(lines, labels, loc0], df.index[len(df)-1]))
plt.xlim((df.index["Energy Consumption Jan - Mar")
plt.title( plt.show()
C:\Users\nicom\AppData\Local\Temp\ipykernel_5468\4157523258.py:15: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
ax1.set_xticklabels([date.strftime('%Y-%m-%d') for date in df.index], rotation=45, ha='right')