Pandas: File Input/Output
Contents
Pandas: File Input/Output#
For more cf. Pandas IO Tools.
import pandas as pd
import numpy as np
CSV: comma separated values#
df = pd.DataFrame(10*np.random.randn(10,3)+3,
columns= ['temperature','CO2','energy consumption'],
index = range(2000,2010))
df
temperature | CO2 | energy consumption | |
---|---|---|---|
2000 | -12.286623 | -9.017741 | 4.175555 |
2001 | -0.206236 | 2.605373 | -5.076079 |
2002 | -8.669134 | 10.150683 | 3.959053 |
2003 | -2.827104 | -16.340451 | -6.078593 |
2004 | -16.600510 | -0.214877 | 1.328272 |
2005 | 19.249710 | -4.046548 | 15.387640 |
2006 | 2.724853 | 7.748419 | 9.717703 |
2007 | 0.738379 | 26.695446 | 1.515813 |
2008 | -7.855695 | -1.237906 | -3.721848 |
2009 | -11.135411 | 1.170897 | -9.016716 |
df.to_csv('daten/my_test_data.csv')
Delete the data frame and read it in from the csv-file:
del df
df= pd.read_csv('daten/my_test_data.csv')
df
Unnamed: 0 | temperature | CO2 | energy consumption | |
---|---|---|---|---|
0 | 2000 | -12.286623 | -9.017741 | 4.175555 |
1 | 2001 | -0.206236 | 2.605373 | -5.076079 |
2 | 2002 | -8.669134 | 10.150683 | 3.959053 |
3 | 2003 | -2.827104 | -16.340451 | -6.078593 |
4 | 2004 | -16.600510 | -0.214877 | 1.328272 |
5 | 2005 | 19.249710 | -4.046548 | 15.387640 |
6 | 2006 | 2.724853 | 7.748419 | 9.717703 |
7 | 2007 | 0.738379 | 26.695446 | 1.515813 |
8 | 2008 | -7.855695 | -1.237906 | -3.721848 |
9 | 2009 | -11.135411 | 1.170897 | -9.016716 |
df= pd.read_csv('daten/my_test_data.csv', index_col=0)
df
temperature | CO2 | energy consumption | |
---|---|---|---|
2000 | -12.286623 | -9.017741 | 4.175555 |
2001 | -0.206236 | 2.605373 | -5.076079 |
2002 | -8.669134 | 10.150683 | 3.959053 |
2003 | -2.827104 | -16.340451 | -6.078593 |
2004 | -16.600510 | -0.214877 | 1.328272 |
2005 | 19.249710 | -4.046548 | 15.387640 |
2006 | 2.724853 | 7.748419 | 9.717703 |
2007 | 0.738379 | 26.695446 | 1.515813 |
2008 | -7.855695 | -1.237906 | -3.721848 |
2009 | -11.135411 | 1.170897 | -9.016716 |
Excel#
df.to_excel('daten/my_test_data.xlsx')
Delete the data frame and read it in from the xlsx-file:
del df
df = pd.read_excel('daten/my_test_data.xlsx', sheet_name='Sheet1')
df
Unnamed: 0 | temperature | CO2 | energy consumption | |
---|---|---|---|---|
0 | 2000 | -12.286623 | -9.017741 | 4.175555 |
1 | 2001 | -0.206236 | 2.605373 | -5.076079 |
2 | 2002 | -8.669134 | 10.150683 | 3.959053 |
3 | 2003 | -2.827104 | -16.340451 | -6.078593 |
4 | 2004 | -16.600510 | -0.214877 | 1.328272 |
5 | 2005 | 19.249710 | -4.046548 | 15.387640 |
6 | 2006 | 2.724853 | 7.748419 | 9.717703 |
7 | 2007 | 0.738379 | 26.695446 | 1.515813 |
8 | 2008 | -7.855695 | -1.237906 | -3.721848 |
9 | 2009 | -11.135411 | 1.170897 | -9.016716 |
Many more file formats can be written and read, cf. Pandas IO Tools.
Date and Time Data#
For more cf. Pandas Time series / date functionality.
import datetime as dt
now = dt.datetime.now()
now
datetime.datetime(2022, 8, 29, 14, 51, 23, 765003)
print(now.year)
print(now.month)
print(now.day)
print(now.year)
print(now.hour)
print(now.minute)
# etc.
2022
8
29
2022
14
51
timestamp1 = dt.datetime(2014, 3, 28, 11, 30)
timestamp1
datetime.datetime(2014, 3, 28, 11, 30)
timestamp2 = dt.datetime(1614, 12, 24, 19, 30)
timestamp2
datetime.datetime(1614, 12, 24, 19, 30)
delta = timestamp1 - timestamp2
delta
datetime.timedelta(days=145825, seconds=57600)
print(delta.days)
print(delta.seconds)
print(delta.total_seconds())
145825
57600
12599337600.0
timestamp1 + dt.timedelta(14)
datetime.datetime(2014, 4, 11, 11, 30)
Converting between string and datetime#
stamp = dt.datetime(2011, 1, 3, 17, 45)
type(stamp)
datetime.datetime
str(stamp)
'2011-01-03 17:45:00'
stamp.strftime('%Y/%B/%d-%H')
'2011/January/03-17'
Directive |
Meaning |
---|---|
|
Locale’s abbreviated weekday name |
|
Locale’s full weekday name |
|
Locale’s abbreviated month name |
|
Locale’s full month name |
|
Locale’s appropriate date and time representation |
|
Day of the month as a decimal number [01,31] |
|
Hour (24-hour clock) as a decimal number [00,23] |
|
Hour (12-hour clock) as a decimal number [01,12] |
|
Day of the year as a decimal number [001,366] |
|
Month as a decimal number [01,12] |
|
Minute as a decimal number [00,59] |
|
Locale’s equivalent of either AM or PM |
|
Second as a decimal number [00,61] |
|
Week number of the year (Sunday as the first day of the week) |
|
Weekday as a decimal number [0(Sunday),6] |
|
Week number of the year (Monday as the first day of the week) |
|
Locale’s appropriate date representation |
|
Locale’s appropriate time representation |
|
Year without century as a decimal number [00,99] |
|
Year with century as a decimal number |
|
Time zone name (no characters if no time zone exists) |
|
A literal |
For more on format strings cf. http://docs.python.org/2/library/time.html#time.strftime.
dt.datetime.strptime('24/12/2012','%d/%m/%Y')
datetime.datetime(2012, 12, 24, 0, 0)
Time Series Basics#
Synthetic data:
df = pd.DataFrame(np.random.randn(10,3),
columns= ['temperature','CO2','energy consumption'],
index = range(2000,2010))
df
temperature | CO2 | energy consumption | |
---|---|---|---|
2000 | -0.175572 | 1.750254 | 0.867122 |
2001 | -0.056392 | 0.063977 | -0.410385 |
2002 | -1.324750 | -0.037674 | 0.630271 |
2003 | -0.867190 | 0.078205 | -0.078580 |
2004 | -0.359342 | -0.782809 | 0.153157 |
2005 | 1.259952 | 0.016337 | -0.935126 |
2006 | -1.323949 | -1.755193 | -0.449625 |
2007 | -2.010301 | -0.642709 | -0.381315 |
2008 | 0.026720 | -0.491324 | -1.285634 |
2009 | -0.218182 | -0.546904 | 0.283252 |
df.index
RangeIndex(start=2000, stop=2010, step=1)
pd.to_datetime(df.index, format='%Y')
DatetimeIndex(['2000-01-01', '2001-01-01', '2002-01-01', '2003-01-01',
'2004-01-01', '2005-01-01', '2006-01-01', '2007-01-01',
'2008-01-01', '2009-01-01'],
dtype='datetime64[ns]', freq=None)
df.index = pd.to_datetime(df.index, format='%Y')
df
temperature | CO2 | energy consumption | |
---|---|---|---|
2000-01-01 | -0.175572 | 1.750254 | 0.867122 |
2001-01-01 | -0.056392 | 0.063977 | -0.410385 |
2002-01-01 | -1.324750 | -0.037674 | 0.630271 |
2003-01-01 | -0.867190 | 0.078205 | -0.078580 |
2004-01-01 | -0.359342 | -0.782809 | 0.153157 |
2005-01-01 | 1.259952 | 0.016337 | -0.935126 |
2006-01-01 | -1.323949 | -1.755193 | -0.449625 |
2007-01-01 | -2.010301 | -0.642709 | -0.381315 |
2008-01-01 | 0.026720 | -0.491324 | -1.285634 |
2009-01-01 | -0.218182 | -0.546904 | 0.283252 |
Read an Excel file with a column already formated as date:
df = pd.read_excel('daten/my_test_data_2.xls', sheet_name='Sheet1', index_col=0)
df
temperature | CO2 | energy consumption | |
---|---|---|---|
date | |||
2000-06-15 | 0.294403 | -0.484523 | 0.642076 |
2001-06-15 | -1.146592 | 0.824076 | -0.756397 |
2002-06-15 | 1.345931 | -0.446263 | -0.139374 |
2003-06-15 | -0.840468 | -1.695912 | -0.674641 |
2004-06-15 | 0.473816 | -1.183667 | -1.006740 |
2005-06-15 | 0.957741 | 0.214721 | -0.750404 |
2006-06-15 | 1.361329 | 1.388585 | 0.492154 |
2007-06-15 | 0.559999 | 0.417268 | 0.793223 |
2008-06-15 | 0.301742 | 0.068523 | 0.645997 |
2009-06-15 | 0.936508 | 1.420025 | -0.793608 |
df.iloc[0]
temperature 0.294403
CO2 -0.484523
energy consumption 0.642076
Name: 2000-06-15 00:00:00, dtype: float64
df['time'] = df.index - dt.datetime(2000,1,1)
df
temperature | CO2 | energy consumption | time | |
---|---|---|---|---|
date | ||||
2000-06-15 | 0.294403 | -0.484523 | 0.642076 | 166 days |
2001-06-15 | -1.146592 | 0.824076 | -0.756397 | 531 days |
2002-06-15 | 1.345931 | -0.446263 | -0.139374 | 896 days |
2003-06-15 | -0.840468 | -1.695912 | -0.674641 | 1261 days |
2004-06-15 | 0.473816 | -1.183667 | -1.006740 | 1627 days |
2005-06-15 | 0.957741 | 0.214721 | -0.750404 | 1992 days |
2006-06-15 | 1.361329 | 1.388585 | 0.492154 | 2357 days |
2007-06-15 | 0.559999 | 0.417268 | 0.793223 | 2722 days |
2008-06-15 | 0.301742 | 0.068523 | 0.645997 | 3088 days |
2009-06-15 | 0.936508 | 1.420025 | -0.793608 | 3453 days |
df['days'] = df['time'].dt.days
df
temperature | CO2 | energy consumption | time | days | |
---|---|---|---|---|---|
date | |||||
2000-06-15 | 0.294403 | -0.484523 | 0.642076 | 166 days | 166 |
2001-06-15 | -1.146592 | 0.824076 | -0.756397 | 531 days | 531 |
2002-06-15 | 1.345931 | -0.446263 | -0.139374 | 896 days | 896 |
2003-06-15 | -0.840468 | -1.695912 | -0.674641 | 1261 days | 1261 |
2004-06-15 | 0.473816 | -1.183667 | -1.006740 | 1627 days | 1627 |
2005-06-15 | 0.957741 | 0.214721 | -0.750404 | 1992 days | 1992 |
2006-06-15 | 1.361329 | 1.388585 | 0.492154 | 2357 days | 2357 |
2007-06-15 | 0.559999 | 0.417268 | 0.793223 | 2722 days | 2722 |
2008-06-15 | 0.301742 | 0.068523 | 0.645997 | 3088 days | 3088 |
2009-06-15 | 0.936508 | 1.420025 | -0.793608 | 3453 days | 3453 |
Generating Date Ranges#
pd.date_range('2014-03-01','2016-07-05') # default frequency = calender day
DatetimeIndex(['2014-03-01', '2014-03-02', '2014-03-03', '2014-03-04',
'2014-03-05', '2014-03-06', '2014-03-07', '2014-03-08',
'2014-03-09', '2014-03-10',
...
'2016-06-26', '2016-06-27', '2016-06-28', '2016-06-29',
'2016-06-30', '2016-07-01', '2016-07-02', '2016-07-03',
'2016-07-04', '2016-07-05'],
dtype='datetime64[ns]', length=858, freq='D')
# custom frequency and number of periods
pd.date_range('2014-04-05 08:00', periods=10, freq='2h30min')
DatetimeIndex(['2014-04-05 08:00:00', '2014-04-05 10:30:00',
'2014-04-05 13:00:00', '2014-04-05 15:30:00',
'2014-04-05 18:00:00', '2014-04-05 20:30:00',
'2014-04-05 23:00:00', '2014-04-06 01:30:00',
'2014-04-06 04:00:00', '2014-04-06 06:30:00'],
dtype='datetime64[ns]', freq='150T')
pd.date_range('2014-03-01','2016-07-05', freq='B') # business day
DatetimeIndex(['2014-03-03', '2014-03-04', '2014-03-05', '2014-03-06',
'2014-03-07', '2014-03-10', '2014-03-11', '2014-03-12',
'2014-03-13', '2014-03-14',
...
'2016-06-22', '2016-06-23', '2016-06-24', '2016-06-27',
'2016-06-28', '2016-06-29', '2016-06-30', '2016-07-01',
'2016-07-04', '2016-07-05'],
dtype='datetime64[ns]', length=612, freq='B')
pd.date_range('2014-03-01','2016-07-05', freq='BMS') # first week day of month
DatetimeIndex(['2014-03-03', '2014-04-01', '2014-05-01', '2014-06-02',
'2014-07-01', '2014-08-01', '2014-09-01', '2014-10-01',
'2014-11-03', '2014-12-01', '2015-01-01', '2015-02-02',
'2015-03-02', '2015-04-01', '2015-05-01', '2015-06-01',
'2015-07-01', '2015-08-03', '2015-09-01', '2015-10-01',
'2015-11-02', '2015-12-01', '2016-01-01', '2016-02-01',
'2016-03-01', '2016-04-01', '2016-05-02', '2016-06-01',
'2016-07-01'],
dtype='datetime64[ns]', freq='BMS')
For more frequency options cf. [Pandas Time series / date functionality ](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
Indexing, Selection, Subsetting#
dates = pd.date_range('2014-03-01','2016-07-05', freq='BMS')
df = pd.DataFrame(np.random.randn(len(dates),3), index = dates, columns = ['A','B','C'])
df.head()
A | B | C | |
---|---|---|---|
2014-03-03 | 0.394580 | -1.494955 | -0.347213 |
2014-04-01 | -0.828459 | -1.064979 | 0.781559 |
2014-05-01 | -0.477372 | 0.553372 | -0.465750 |
2014-06-02 | -2.028449 | 0.365953 | 0.158726 |
2014-07-01 | 0.941145 | -0.978360 | 1.302139 |
df.loc['2014']
A | B | C | |
---|---|---|---|
2014-03-03 | 0.394580 | -1.494955 | -0.347213 |
2014-04-01 | -0.828459 | -1.064979 | 0.781559 |
2014-05-01 | -0.477372 | 0.553372 | -0.465750 |
2014-06-02 | -2.028449 | 0.365953 | 0.158726 |
2014-07-01 | 0.941145 | -0.978360 | 1.302139 |
2014-08-01 | -0.902680 | -0.186172 | -0.919617 |
2014-09-01 | -0.465930 | -0.631799 | 1.320748 |
2014-10-01 | 0.509732 | -1.081103 | 2.057128 |
2014-11-03 | -0.514350 | 0.729604 | -0.561315 |
2014-12-01 | 0.021127 | 0.030578 | -0.437966 |
df.loc['2014-06':'2014-12']
A | B | C | |
---|---|---|---|
2014-06-02 | -2.028449 | 0.365953 | 0.158726 |
2014-07-01 | 0.941145 | -0.978360 | 1.302139 |
2014-08-01 | -0.902680 | -0.186172 | -0.919617 |
2014-09-01 | -0.465930 | -0.631799 | 1.320748 |
2014-10-01 | 0.509732 | -1.081103 | 2.057128 |
2014-11-03 | -0.514350 | 0.729604 | -0.561315 |
2014-12-01 | 0.021127 | 0.030578 | -0.437966 |
df.loc[:'2014-12-01']
A | B | C | |
---|---|---|---|
2014-03-03 | 0.394580 | -1.494955 | -0.347213 |
2014-04-01 | -0.828459 | -1.064979 | 0.781559 |
2014-05-01 | -0.477372 | 0.553372 | -0.465750 |
2014-06-02 | -2.028449 | 0.365953 | 0.158726 |
2014-07-01 | 0.941145 | -0.978360 | 1.302139 |
2014-08-01 | -0.902680 | -0.186172 | -0.919617 |
2014-09-01 | -0.465930 | -0.631799 | 1.320748 |
2014-10-01 | 0.509732 | -1.081103 | 2.057128 |
2014-11-03 | -0.514350 | 0.729604 | -0.561315 |
2014-12-01 | 0.021127 | 0.030578 | -0.437966 |
Shifting (Leading and Lagging) Data#
# frequency M indicates the last calendar day of a month
ts = pd.Series(np.random.randn(4), index = pd.date_range('2000-01-01', periods=4, freq='M'))
ts
2000-01-31 -1.513709
2000-02-29 -0.450101
2000-03-31 0.309443
2000-04-30 -1.272072
Freq: M, dtype: float64
ts.shift(2)
2000-01-31 NaN
2000-02-29 NaN
2000-03-31 -1.513709
2000-04-30 -0.450101
Freq: M, dtype: float64
ts.shift(-2)
2000-01-31 0.309443
2000-02-29 -1.272072
2000-03-31 NaN
2000-04-30 NaN
Freq: M, dtype: float64
If the frequency is passed to shift
the timestamps are advanced.
ts.shift(2, freq='M')
2000-03-31 -1.513709
2000-04-30 -0.450101
2000-05-31 0.309443
2000-06-30 -1.272072
Freq: M, dtype: float64
ts.shift(2, freq='D')
2000-02-02 -1.513709
2000-03-02 -0.450101
2000-04-02 0.309443
2000-05-02 -1.272072
dtype: float64
Links#
For more details and topics like
time spans (periods)
time zone handling
resampling (frequency conversion): downsamplig, upsampling, …
cf. the documentation Pandas Time series / date functionality.