item_format.R

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(stringr)

# set the working directory
setwd("C:/Users/Bobby/OneDrive/Documents/School/Information Management/Data")

# Define the list of csv files
csv_files <- c("heb_bakerybread.csv", "heb_dairy.csv", "heb_meat_seafood.csv", "heb_pantry.csv", "heb_produce.csv")

# Create an empty list to hold the dataframes
dfs <- list()

# Loop through each file
for (csv_file in csv_files) {
  # Read the csv file into a dataframe
  df <- read_csv(csv_file)
  
  # Remove any columns with "url" in the name
  df <- select(df, -contains("item_price_url"))
  
  # Create a "scrape_date" column and set the value to "4/17/2023"
  df$scrape_date <- "4/17/2023"
  
  # Create a "category" column and set the value based on the filename
  category <- str_replace_all(str_remove(csv_file, "\\.csv"), "heb_", "")
  df$category <- category
  
  # Generate a unique ID for each item
  df$item_id <- paste0(str_sub(category, start = 1, end = 1), seq_along(df$item_name))
  
  # Reorder the columns to make "item_id" the first column
  df <- select(df, item_id, everything())
  
  # Append the dataframe to the list
  dfs[[csv_file]] <- df
}

## Rows: 930 Columns: 4

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): item_name, item_url, item_price, item_price_url

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 1543 Columns: 4

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): item_name, item_url, item_price, item_price_url

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 1073 Columns: 4

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): item_name, item_url, item_price, item_price_url

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 10006 Columns: 4

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): item_name, item_url, item_price, item_price_url

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 614 Columns: 4

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): item_name, item_url, item_price, item_price_url

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Combine all dataframes into one
combined_df <- bind_rows(dfs)

# Write the combined dataframe to a new csv file
write_csv(combined_df, "heb_combined.csv")

item_format.R

Bobby

2023-05-03