Data Wrangling in R

On This Page

In data science, clean data is half the battle won.

Data Wrangling is the process of cleaning, organizing, transforming, and preparing raw data into a structured and usable format for analysis. It involves handling missing values, correcting inconsistencies, filtering irrelevant information, and reshaping datasets to ensure accuracy and consistency.

Importing Data in R

Import data from various sources including Text, CSV, Excel, Database, Google Sheets, and Financial APIs

Text File

# Importing a text file
data_txt <- read.table("data.txt", header = TRUE)
# Importing a tab-delimited file
data_tab <- read.delim("data.txt", header = TRUE)

CSV File

library(readr)
dataset <- read_csv(data.csv)

Excel File

library(readxl)
dataset <- read_excel(data.xlsx) 

Database

library(DBI)
con <- dbConnect(RMySQL::MySQL(),
                 dbname = "dbname",
                 host = "localhost",
                 user = "username",
                 password = "password")
# Read a table 
data <- dbReadTable(con, "table_name")

Google Sheets

library(googlesheets4)
data <- read_sheet("https://docs.google.com/spreadsheets/d/your_sheet_id")

Financial APIs

library(quantmod) 
getSymbols("AAPL", src = "yahoo")     # Apple stock 

Exploring the Dataset

View(df) # View complete data frame

Structure

head(df)       # First 6 rows
tail(df)       # Last 6 rows
dim(df)        # Dimensions (rows, columns)
str(df)        # Structure of the dataset
summary(df)    # Summary statistics
colnames(df)   # Column names

Data Types

sapply(df, class) # Class of each column
typeof(df$column_name) # Internal type of a column

Handling Missing Values

is.na(df)  # Logical matrix showing NA values
sum(is.na(df))  # Total number of missing values
colSums(is.na(df))  # Number of missing values per column
rowSums(is.na(df))  # Number of missing values per row
anyNA(df)  # Returns TRUE if any NA exists

Remove Missing Values

df <- na.omit(df)         # Remove rows with missing values

Replacing Missing Values

# Replace with a specific value (e.g., 0)
df$column_name[is.na(df$column_name)] <- 0
# Replace with the mean (for numeric columns)
df$column_name[is.na(df$column_name)] <- mean(df$column_name, na.rm = TRUE)
# Replace with the median (for numeric columns)
df$column_name[is.na(df$column_name)] <- median(df$column_name, na.rm = TRUE)

Handling Outliers

There are multiple ways to detect and remove outliers. The best method depends on the dataset and context. IQR and Z-score methods are widely used.

Visualizing Outliers with Boxplots

boxplot(df$column_name, main = "Boxplot of Column", col = "red")

Using the IQR

# Compute Q1 (25th percentile) and Q3 (75th percentile)
Q1 <- quantile(df$column_name, 0.25, na.rm = TRUE)
Q3 <- quantile(df$column_name, 0.75, na.rm = TRUE)
# Calculate IQR
IQR <- Q3 - Q1
# Define lower and upper bounds
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Filter out outliers
df_clean <- df[df$column_name >= lower_bound & df$column_name <= upper_bound, ]

Using the Z-score

# Compute Z-scores
df$z_score <- (df$column_name - mean(df$column_name, na.rm = TRUE)) / sd(df$column_name, na.rm = TRUE)
# Remove outliers
df_clean <- df[abs(df$z_score) <= 3, ]

Using Boxplot Stats

# Identify outliers using boxplot stats
outliers <- boxplot.stats(df$column_name)$out
# Remove outliers
df_clean <- df[!df$column_name %in% outliers, ]

Renaming Columns

# Rename all columns by vector of names
colnames(df) <- c("new_name1", "new_name2", "new_name3")
# Change only the second column's name
colnames(df)[2] <- "new_column_name"

Filtering and Subsetting

Subsetting Rows

# Select rows where column value is greater than 50
df_subset <- df[df$column_name > 50, ]
# Select rows where column equals a specific value
df_subset <- df[df$gender == "Male", ]
# Multiple conditions using logical AND (&) and OR (|)
df_subset <- df[df$age > 25 & df$gender == "Female", ]
df_subset <- df[df$age > 25 | df$income > 50000, ]
# Exclude rows where age is less than 25
df_subset <- df[!(df$age < 25), ]
# Keep rows where column is not NA
df_subset <- df[!is.na(df$income), ]

Subsetting Specific Columns

# Select specific columns
df_subset <- df[, c("name", "age", "gender")]
# Select a column using the $ operator
gender_vector <- df$gender

Creating New Variables (Mutating)

# Create a new column: total price (unit price * quantity)
df$total_price <- df$unit_price * df$quantity
# Create a categorical column based on a condition
df$income_group <- ifelse(df$income > 50000, "High", "Low")

Normalization and Standardization

Normalization (Min-Max Scaling)

\[X_{norm} = \frac{X - \min(X)}{\max(X) - \min(X)}\]

# Normalize a column using min-max scaling
df$normalized_col <- (df$column_name - min(df$column_name, na.rm = TRUE)) /
                     (max(df$column_name, na.rm = TRUE) - min(df$column_name, na.rm = TRUE))

Standardization (Z-score Scaling)

\[X_{std} = \frac{X - \mu}{\sigma}\]

# Standardize a column using z-score
df$standardized_col <- (df$column_name - mean(df$column_name, na.rm = TRUE)) /
                       sd(df$column_name, na.rm = TRUE)

Grouping

Grouping is used to compute summary statistics for each group in a dataset for example, mean income by gender, sales by region, or count by category.

# Mean income by gender
aggregate(income ~ gender, data = df, FUN = mean)
# Multiple summary functions
aggregate(cbind(income, expenditure) ~ gender, data = df, FUN = mean)
# Sum of sales by region
aggregate(sales ~ region, data = df, FUN = sum)

Merging and Joining Datasets

# Inner Join (default)
merged_data <- merge(df1, df2, by = "id")
# Left Join
merged_data <- merge(df1, df2, by = "id", all.x = TRUE)
# Right Join
merged_data <- merge(df1, df2, by = "id", all.y = TRUE)
# Full Join
merged_data <- merge(df1, df2, by = "id", all = TRUE)
# Join by different column names
merged_data <- merge(df1, df2, by.x = "roll_no", by.y = "id")

Dataset df1

id	name
1	A
2	B
3	C

Dataset df2

id	score
2	80
3	90
4	85

Inner Join

Only matching ids in both df1 and df2 by = “id”

id	name	score
2	B	80
3	C	90

Left Join

All records from df1, matched values from df2. Unmatched gets NA by = “id” all.x = TRUE

id	name	score
1	A	NA
2	B	80
3	C	90

Right Join

All records from df2, matched values from df1. Unmatched gets NA by = “id” all.y = TRUE

id	name	score
2	B	80
3	C	90
4	NA	85

Full Join

All records from both datasets by = “id” all = TRUE

id	name	score
1	A	NA
2	B	80
3	C	90
4	NA	85

Join by Different Column Names

Join columns with different names using by.x and by.y (merge(df1, df2, by.x = "roll_no", by.y = "id"))

Assume df1 is:

roll_no	name
1	A
2	B
3	C

Join with:

id	score
2	80
3	90

Result:

roll_no	name	score
2	B	80
3	C	90

flowchart LR
  subgraph DF1
    A1[roll_no: 1\nname: A]
    A2[roll_no: 2\nname: B]
    A3[roll_no: 3\nname: C]
  end
  subgraph DF2
    B1[id: 2\nscore: 80]
    B2[id: 3\nscore: 90]
  end
  subgraph RESULT_MERGE_by_roll_no
    R1[roll_no: 2\nname: B\nscore: 80]
    R2[roll_no: 3\nname: C\nscore: 90]
  end
  A2 --> R1
  B1 --> R1
  A3 --> R2
  B2 --> R2