DataFrames
Creating DataFrames
import pandas as pd df = pd.DataFrame({ "name": ["Alice", "Bob", "Carol"], "age": [25, 30, 35], "score": [88, 92, 79] })
Inspection
df.head(n)First n rows (default 5)
df.tail(n)Last n rows
df.shapeTuple of (rows, columns)
df.dtypesData type of each column
df.info()Column types, non-null counts
df.describe()Statistics for numeric columns
df.columnsColumn names as Index
df.indexRow labels
Reading Data
Common Readers
df = pd.read_csv("data.csv") df = pd.read_excel("data.xlsx") df = pd.read_json("data.json") df = pd.read_sql(query, connection)
Writing Data
df.to_csv("out.csv", index=False) df.to_excel("out.xlsx", index=False) df.to_json("out.json", orient="records")
Read Options
sep=";"Custom delimiter
header=NoneNo header row in file
usecols=[0,2]Read specific columns only
nrows=100Read first 100 rows
na_values=["N/A"]Treat as NaN
Selection
Columns
df["name"] # single column (Series) df[["name", "age"]] # multiple columns (DataFrame) df.name # attribute access (simple names)
Rows with loc / iloc
df.loc[0] # row by label df.loc[0:2, "name"] # rows 0-2, column "name" df.iloc[0] # row by position df.iloc[0:2, 0:2] # first 2 rows, 2 cols
loc vs iloc
df.loc[row, col]Select by **label** (inclusive end)
df.iloc[row, col]Select by **position** (exclusive end)
df.at[row, col]Fast scalar access by label
df.iat[row, col]Fast scalar access by position
Filtering
Boolean Filtering
df[df["age"] > 25] df[df["name"].str.contains("li")] df[(df["age"] > 25) & (df["score"] > 80)] df[df["name"].isin(["Alice", "Bob"])]
Handling Missing Data
df.isna().sum() # NaN count per column df.dropna() # drop rows with any NaN df.fillna(0) # fill NaN with 0 df["col"].fillna(df["col"].mean())
Sorting
df.sort_values("age") # ascending df.sort_values("age", ascending=False) df.sort_values(["age", "score"]) # multi
Aggregation
Common Aggregations
df["col"].sum()Sum of column
df["col"].mean()Mean
df["col"].median()Median
df["col"].std()Standard deviation
df["col"].min() / .max()Min / max
df["col"].count()Non-null count
df["col"].nunique()Number of unique values
df["col"].value_counts()Frequency of each value
Multiple Aggregations
df.agg({"age": "mean", "score": ["min", "max"]}) df.describe() # summary stats for all numeric
GroupBy
Basic Grouping
df.groupby("dept")["salary"].mean() df.groupby("dept").agg( avg_sal=("salary", "mean"), count=("salary", "count") )
Multiple Groups
df.groupby(["dept", "year"])["sales"].sum() df.groupby("dept").size() # rows per group
Transform & Apply
df["z_score"] = df.groupby("dept")["salary"] \ .transform(lambda x: (x - x.mean()) / x.std()) df.groupby("dept").apply(lambda g: g.nlargest(3, "salary"))
Merging
Merge (SQL-style Join)
pd.merge(df1, df2, on="id") # inner pd.merge(df1, df2, on="id", how="left") pd.merge(df1, df2, left_on="uid", right_on="user_id")
Join Types
how="inner"Keep only matching rows (default)
how="left"Keep all left rows, NaN for no match
how="right"Keep all right rows
how="outer"Keep all rows from both sides
Concatenation
pd.concat([df1, df2]) # stack rows pd.concat([df1, df2], axis=1) # side by side pd.concat([df1, df2], ignore_index=True)
Pivot Tables
Pivot Table
df.pivot_table( values="sales", index="region", columns="quarter", aggfunc="sum" )
Reshaping
df.melt(id_vars=["name"], value_vars=["q1", "q2"], var_name="quarter", value_name="sales")
Cross Tabulation
pd.crosstab(df["dept"], df["gender"]) pd.crosstab(df["dept"], df["gender"], normalize="index") # row percentages
Time Series
DateTime Basics
df["date"] = pd.to_datetime(df["date"]) df["year"] = df["date"].dt.year df["month"] = df["date"].dt.month df["weekday"] = df["date"].dt.day_name()
Date Ranges & Resampling
pd.date_range("2025-01-01", periods=12, freq="ME") df.set_index("date").resample("ME")["sales"].sum()

freq="ME" requires pandas ≥ 2.2. Use "M" on older versions.

Accessor Attributes
.dt.year / .dt.month / .dt.dayExtract date components
.dt.hour / .dt.minuteExtract time components
.dt.day_name()Weekday name (Monday, etc.)
.dt.days_in_monthDays in that month
Common Patterns
Rename Columns
df.rename(columns={"old": "new"}) df.columns = ["a", "b", "c"] # replace all
Add / Modify Columns
df["total"] = df["q1"] + df["q2"] df["grade"] = df["score"].apply( lambda x: "A" if x >= 90 else "B" )
Drop Columns / Rows
df.drop(columns=["temp"]) df.drop_duplicates(subset=["name"]) df.reset_index(drop=True)
String Operations
df["name"].str.lower() df["name"].str.contains("ali", case=False) df["name"].str.split(" ").str[0] # first name