Pandas Tricks


Create dataframe

# create DataFrame 1
>>> import pandas
>>> df = pandas.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
>>> df
   x  y
0  1  4
1  2  5
2  3  6

Converting a DataFrame to a Numpy Array

df.to_numpy()    # recommended since pandas 0.24
df.values        # older equivalent, still works
# df.as_matrix() was deprecated and removed in pandas 1.0
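
With the DataFrame created above, the conversion looks like this:

>>> df.to_numpy()
array([[1, 4],
       [2, 5],
       [3, 6]])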

Iterate dataframe

1. Using iterrows() in a loop:

>>> z = []
>>> for index, row in df.iterrows():
...     z.append(row.x + row.y)
... 
>>> z
[5, 7, 9]

2. Using a list comprehension:

z = [row.x + row.y for index, row in df.iterrows()]

We can then assign this list to a new column:

>>> df['z'] = z
>>> df
   x  y  z
0  1  4  5
1  2  5  7
2  3  6  9

Creating a new Column by Looping
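
This assumes a pizza-style DataFrame with a topping column; a minimal setup for testing (the values here are made up) might be:

df = pandas.DataFrame({'topping': ['pepperoni', 'mushrooms', 'chicken']})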

topping_type = []

for row in df.topping:
    if row in ['pepperoni', 'chicken', 'anchovies']:
        topping_type.append('meat')
    else:
        topping_type.append('vegetable')

df['topping_type'] = topping_type

To apply a function to all columns or rows

>>> df = pandas.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
>>> df.apply(lambda row: row.x + row.y, axis=1)
0    5
1    7
2    9 
dtype: int64

or
>>> df['z'] = df.apply(lambda row: row.x + row.y, axis=1)
>>> df
   x  y  z
0  1  4  5
1  2  5  7
2  3  6  9


Vectorized operations

>>> df = pandas.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
>>> df.x
0    1
1    2
2    3
Name: x, dtype: int64

>>> df['z'] = df.x + df.y
>>> df
   x  y  z
0  1  4  5
1  2  5  7
2  3  6  9

Means/Std of dataframe

df['average'] = df.mean(axis=1)
       salary_1  salary_2  salary_3     average
0       230       235       210  225.000000
1       345       375       385  368.333333
2       222       292       260  258.000000

or the mean of just a subset of columns:
df['average_1_3'] = df[['salary_1', 'salary_3']].mean(axis=1)
   salary_1  salary_2  salary_3  average_1_3
0       230       235       210        220.0
1       345       375       385        365.0
2       222       292       260        241.0

Another method is to slice the columns with .loc:
col = df.loc[: , "salary_1":"salary_3"]
df['salary_mean'] = col.mean(axis=1)
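
The heading mentions std as well; a row-wise standard deviation works the same way, reusing col from above (the column name salary_std is arbitrary):

df['salary_std'] = col.std(axis=1)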

Sum of dataframe

df['C'] = df.sum(axis=1,numeric_only=True)
or
df['C'] = df['A'] + df['B']
or
df['C'] = df.apply(lambda row: row['A'] + row['B'], axis=1)
or
df['C'] =  df[['A', 'B']].sum(axis=1)
or
df['sum'] = df[list(df.columns)].sum(axis=1)
or
df.assign(C = df.A + df.B,
          Diff = df.B - df.A,
          Mult = df.A * df.B)
# Output:
#    A  B   C  Diff  Mult
# 0  1  4   5     3     4
# 1  2  6   8     4    12
# 2  3  9  12     6    27

Importing a CSV File / Loading Massive Datasets in Smaller Chunks

df = pd.read_csv('pizza.csv')
#Need to parse dates? Just pass in the corresponding column name(s).

df = pd.read_csv('pizza.csv', parse_dates=['dates'])
#Only need a few specific columns?

df = pd.read_csv('pizza.csv', usecols=['foo', 'bar'])




chunksize = 500
chunks = []
for chunk in pd.read_csv('pizza.csv', chunksize=chunksize):
    # Do stuff...
    chunks.append(chunk)

df = pd.concat(chunks, axis=0)

Exploring Data in a DataFrame

df.head()       # first five rows
df.tail()       # last five rows
df.sample(5)    # random sample of rows
df.shape        # number of rows/columns in a tuple
df.describe()   # calculates measures of central tendency
df.info()       # memory footprint and datatypes

Adding a New Column to a DataFrame

#Need the column in a certain order? The first argument is the position of the column. This will put the column at the beginning of the DataFrame.
df.insert(0, 'original_price', full_price)
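
The snippet assumes full_price already exists with one value per row; for a self-contained run, hypothetical values work:

full_price = [18.99, 19.99, 24.99]   # made-up prices, one per row
df.insert(0, 'original_price', full_price)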

Select a Specific “Cell” Value

df.loc[rowindex, 'A']   # by row label and column name
df.iloc[2, 0]           # by integer positions
df.at[rowindex, 'A']    # fast scalar access by label
# df.ix was removed in pandas 1.0; chained lookups like df.iloc[row]['A'] also work but are slower
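
For example, with the small x/y DataFrame from earlier:

>>> df.loc[2, 'x']
3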

Filtering dataframes with conditional logic

filtered_data = df[(df.price > 11.99) & (df.topping == 'Pineapple')]

Sorting a DataFrame by a Certain Column

df.sort_values('price', axis=0, ascending=False)
df.sort_values(['avg', "ID"])  # sort by avg first, then by ID

Set output precision

pandas.set_option('display.precision', 2)

Apply a Function to Every Row in a Column

def calculate_taxes(price):
    taxes = price * 0.12
    return taxes

df['taxes'] = df.price.apply(calculate_taxes)

Add a New Column with Conditional Logic

import numpy as np

df['profitable'] = np.where(df['price'] >= 15.00, True, False)
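
Since the comparison already produces booleans, np.where is only needed when the two outcomes are something other than True/False; the simpler equivalent here is:

df['profitable'] = df['price'] >= 15.00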

Combining DataFrames with Concatenation

pd.concat([df_1, df_2], axis=0)
#Or to concat columns horizontally:

pd.concat([df_1, df_2], axis=1)

Combining DataFrames based on an Index Key

#Merging in pandas works just like SQL. If you have two DataFrames that share a key, perhaps a pizza 'order_id', you can perform inner, outer, left, and right joins just as you would in SQL.

merged_df = df_1.merge(df_2, how='left', on='order_id')

Converting Dates to their own Day, Week, Month, Year Columns

#Make sure the column is in datetime format, then use the .dt accessor to extract the parts you need.

date = pd.to_datetime(df.date)
df['weekday'] = date.dt.weekday
df['year'] = date.dt.year

Finding NaNs in a DataFrame

#Count the total number of NaNs present:

df.isnull().sum().sum()
#List the NaN count for each column:

df.isnull().sum()

Filling NaNs or Missing Data

df.topping = df.topping.fillna('Cheese')
# or we can drop any row missing data across the entire DataFrame:

df = df.dropna(axis=0)

Extracting Features by Grouping Columns

#Grouping columns is a great way to extract features from data. This is especially useful when you have data that can be counted or quantified in some way. For example, you might group pizzas by topping, then calculate the mean price in each group.

df.groupby('topping')['discount'].apply(lambda x: np.mean(x))
#or maybe you want to see the count of a certain value

df.groupby('topping')['discount'].apply(lambda x: x.count())

topping
Anchovies      3
Bell Pepper    1
Cheese         2
Olives         1
Pepperoni      3
Pineapple      2
Veggie         1
Name: discount, dtype: int64
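
The lambdas above are just wrappers; calling the aggregation methods directly is equivalent and more idiomatic:

df.groupby('topping')['discount'].mean()
df.groupby('topping')['discount'].count()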

Creating Bins

#Let’s say we want to create 3 separate bins for different price ranges. This is especially useful for simplifying noisy data.

bins = [0, 5, 15, 30]
names = ['Cheap', 'Normal', 'Expensive']

df['price_point'] = pd.cut(df.price, bins, labels=names)

  order_number  price price_point
0     PZZA0000  12.99      Normal
1     PZZA0001  14.50      Normal
2     PZZA0002  19.99   Expensive
3     PZZA0003  20.99   Expensive
4     PZZA0004  21.99   Expensive

Square sum
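
This entry was left empty; a minimal sketch, assuming the goal is a sum of squares:

df['x'].pow(2).sum()                 # sum of squared values in one column
(df[['x', 'y']] ** 2).sum(axis=1)    # row-wise sum of squares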


Drop rows matching a condition (e.g. outliers)

df_train.drop(df_train[df_train['GrLivArea'] >= 4000].index, inplace=True)

Select some columns

df_train_shrinked = df_train.loc[:, ['GrLivArea', 'YearBuilt', 'OverallCond', 'BsmtQual']]
df_x = df_train_shrinked

   GrLivArea  YearBuilt  OverallCond BsmtQual
0       1710       2003            5       Gd
1       1262       1976            8       Gd
2       1786       2001            5       Gd
3       1717       1915            5       TA
4       2198       2000            5       Gd

Categorical data to indicator values (one-hot encoding)

df_x=pd.get_dummies(df_x)
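
For instance, the BsmtQual column above holds categories like 'Gd' and 'TA'; get_dummies expands such a column into one indicator column per category (recent pandas versions return booleans rather than 0/1):

>>> pd.get_dummies(pd.DataFrame({'BsmtQual': ['Gd', 'TA', 'Gd']}))
   BsmtQual_Gd  BsmtQual_TA
0            1            0
1            0            1
2            1            0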

Column change types

df.x = df.x.astype(float)

MinMaxScaler from sklearn

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[["x"]] = scaler.fit_transform(df[["x"]])

Group sum

Group by column mm and sum column k:

x = x.groupby('mm')['k'].sum().reset_index()

Make a date index

df = pd.DataFrame({'value': range(1, 32, 2)},
     index=pd.date_range('2019-01-01', '2019-01-31', freq='2D'))  # freq="W" gives weekly dates

Map a column's categories to values with a dict

dictA = {"m":1,...}
df['x'] = df['x'].apply(dictA.get)   # or equivalently: df['x'].map(dictA)
