library(caTools)
library(corrgram)
## Warning: package 'corrgram' was built under R version 4.1.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Load Data
data <- read.csv('C:\\Users\\Bobby\\OneDrive\\Documents\\School\\GISC 6323 - Knowledge Mining\\Final Project\\ETH-USD.csv')
head(data)
##         Date     Open     High      Low    Close Adj.Close      Volume
## 1 2021-04-30 2757.734 2796.055 2728.170 2773.207  2773.207 29777179889
## 2 2021-05-01 2772.838 2951.441 2755.908 2945.893  2945.893 28726205272
## 3 2021-05-02 2945.560 2984.892 2860.526 2952.056  2952.056 28032013047
## 4 2021-05-03 2951.176 3450.038 2951.176 3431.086  3431.086 49174290212
## 5 2021-05-04 3431.132 3523.586 3180.743 3253.629  3253.629 62402045158
## 6 2021-05-05 3240.555 3541.463 3213.102 3522.783  3522.783 48334198383
# Visualization of Dataset
glimpse(data)
## Rows: 366
## Columns: 7
## $ Date      <chr> "2021-04-30", "2021-05-01", "2021-05-02", "2021-05-03", "202~
## $ Open      <dbl> 2757.734, 2772.838, 2945.560, 2951.176, 3431.132, 3240.555, ~
## $ High      <dbl> 2796.055, 2951.441, 2984.892, 3450.038, 3523.586, 3541.463, ~
## $ Low       <dbl> 2728.170, 2755.908, 2860.526, 2951.176, 3180.743, 3213.102, ~
## $ Close     <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Adj.Close <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Volume    <dbl> 29777179889, 28726205272, 28032013047, 49174290212, 62402045~
data$Date = as.Date(data$Date,format = '%Y-%m-%d')
glimpse(data)
## Rows: 366
## Columns: 7
## $ Date      <date> 2021-04-30, 2021-05-01, 2021-05-02, 2021-05-03, 2021-05-04,~
## $ Open      <dbl> 2757.734, 2772.838, 2945.560, 2951.176, 3431.132, 3240.555, ~
## $ High      <dbl> 2796.055, 2951.441, 2984.892, 3450.038, 3523.586, 3541.463, ~
## $ Low       <dbl> 2728.170, 2755.908, 2860.526, 2951.176, 3180.743, 3213.102, ~
## $ Close     <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Adj.Close <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Volume    <dbl> 29777179889, 28726205272, 28032013047, 49174290212, 62402045~
# Exploratory Data Analysis: 

# Correlation Between Attributes using Corrgram
corrgram(data, lower.panel=panel.shade, upper.panel=panel.cor)

open.close <- melt(data,id.vars='Date', measure.vars=c('Open', 'Close'))

# Box Plot of Open vs Closing Prices
bplot <- ggplot(open.close) +
  geom_boxplot(aes(x=, y=value, color=variable)) 
bplot               

mean(data$Close)
## [1] 3186.631
mean(data$Open)
## [1] 3187.535
summary(data$Open)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1786    2636    3135    3188    3740    4810
summary(data$Close)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1788    2635    3135    3187    3738    4812
# Frequency of Volume
ggplot(data = data) +
  geom_histogram(mapping = aes(x = Volume))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Closing Price vs Volume
ggplot(data = data) +
  geom_hex(mapping = aes(x = Volume, y = Close))

# Random seed training/test
set.seed(50)

# Split and Training/Testing
Split <- sample.split(Y=data$Close + data$Open, SplitRatio=0.7)
training <- subset(x=data, Split==TRUE)
testing <- subset(x=data, Split==FALSE)

# Train Model

model <- lm(formula=Close ~ Open, data=training)
summary(model)
## 
## Call:
## lm(formula = Close ~ Open, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -909.81  -88.48   -3.38   81.88  506.06 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 119.05619   46.99141   2.534   0.0119 *  
## Open          0.96121    0.01431  67.163   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 163.3 on 254 degrees of freedom
## Multiple R-squared:  0.9467, Adjusted R-squared:  0.9465 
## F-statistic:  4511 on 1 and 254 DF,  p-value: < 2.2e-16
# Plot Residuals
resid <- as.data.frame(residuals(model)) 

ggplot(resid, aes(residuals(model))) +
  geom_histogram(fill='seagreen', color='orange')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Predictions
predicts <- predict(model, testing)
predicts
##        1        4        7       16       17       23       24       38 
## 2769.812 2955.750 3507.248 4036.894 3619.613 2460.573 2328.265 2646.792 
##       42       46       47       49       50       51       52       57 
## 2628.907 2530.507 2606.434 2394.530 2401.528 2267.509 2206.164 2031.106 
##       67       70       72       73       75       77       78       80 
## 2350.907 2347.088 2182.770 2148.047 2074.248 2036.389 1961.306 1945.538 
##       88       91       92       93       94       98      100      102 
## 2225.361 2328.885 2409.178 2485.142 2551.357 2738.992 2898.589 3015.066 
##      106      109      111      118      120      133      147      154 
## 3049.781 3300.099 3014.180 3170.190 3095.346 3437.673 3077.630 2860.959 
##      155      159      163      164      165      169      171      172 
## 3003.766 3369.655 3540.955 3555.395 3406.125 3762.183 3800.347 3817.525 
##      178      180      181      183      185      186      187      191 
## 4129.078 4172.795 4090.934 4241.376 4274.113 4240.925 4273.879 4427.813 
##      194      198      199      209      212      215      220      221 
## 4742.535 4604.744 4587.359 4290.734 4005.219 4394.287 4078.877 4155.169 
##      227      235      236      238      248      251      252      256 
## 4045.410 3890.544 3904.739 3946.550 3742.136 3766.138 3531.065 3154.138 
##      258      260      261      263      266      268      277      278 
## 3231.880 3241.683 3300.505 3340.014 3094.256 2580.850 2621.334 2702.686 
##      282      283      284      286      292      293      295      298 
## 2987.730 3017.060 3057.875 3119.162 2938.980 3176.127 2891.507 2644.789 
##      300      305      308      309      310      311      313      318 
## 2656.114 2638.548 2954.771 2844.069 2635.954 2680.621 2519.886 2592.714 
##      322      329      330      336      338      345      346      347 
## 2783.490 3032.536 3106.922 3373.023 3435.021 3187.209 3253.836 3204.127 
##      351      352      355      359      361      364 
## 3022.034 3042.208 3058.017 2968.848 2928.658 2895.842
# Create dataframe of actual/predicted values
mod_eval <- cbind(testing$Close, predicts)
colnames(mod_eval) <- c('Actual', 'Predicted')
mod_eval <- as.data.frame(mod_eval)
head(mod_eval)
##      Actual Predicted
## 1  2773.207  2769.812
## 4  3431.086  2955.750
## 7  3490.880  3507.248
## 16 3638.122  4036.894
## 17 3587.506  3619.613
## 23 2295.706  2460.573
# Merge Date Column to new Dataframe
mod_eval2 <- cbind(testing$Date, mod_eval)
colnames(mod_eval2) <- c('Date','Actual', 'Predicted')
head(mod_eval2)
##          Date   Actual Predicted
## 1  2021-04-30 2773.207  2769.812
## 4  2021-05-03 3431.086  2955.750
## 7  2021-05-06 3490.880  3507.248
## 16 2021-05-15 3638.122  4036.894
## 17 2021-05-16 3587.506  3619.613
## 23 2021-05-22 2295.706  2460.573
# Melt Predicted and Actual together for visualization
modeval_Melted <- reshape2::melt(mod_eval2, id.var='Date')
head(modeval_Melted)
##         Date variable    value
## 1 2021-04-30   Actual 2773.207
## 2 2021-05-03   Actual 3431.086
## 3 2021-05-06   Actual 3490.880
## 4 2021-05-15   Actual 3638.122
## 5 2021-05-16   Actual 3587.506
## 6 2021-05-22   Actual 2295.706
# Line Graph Actual vs. Predicted
lineplot <- ggplot(modeval_Melted, aes(x=Date, y=value, col=variable)) + geom_line(aes(group=1), size=1) 
lineplot

# could not get GGplot2 to cooperate so exported to csv and visualized in excel
write.csv(mod_eval)
## "","Actual","Predicted"
## "1",2773.207031,2769.81236802604
## "4",3431.086182,2955.75003442064
## "7",3490.880371,3507.24806701246
## "16",3638.12207,4036.89363590559
## "17",3587.506104,3619.61319192519
## "23",2295.705566,2460.57301909215
## "24",2109.579834,2328.26517588701
## "38",2715.092773,2646.79182057118
## "42",2471.518555,2628.90739085086
## "46",2537.891113,2530.50653560699
## "47",2610.936768,2606.43445968221
## "49",2372.001953,2394.53009586992
## "50",2231.733154,2401.52818819939
## "51",2178.499023,2267.50913133833
## "52",2246.364502,2206.16359617072
## "57",1813.217285,2031.1063999532
## "67",2198.58252,2350.90706916354
## "70",2120.026367,2347.0875812346
## "72",2111.403564,2182.76958548948
## "73",2139.664795,2148.04688371692
## "75",1940.083984,2074.24788546914
## "77",1911.175659,2036.38940688082
## "78",1880.382935,1961.30594250409
## "80",1895.552124,1945.53823511412
## "88",2233.366699,2225.36076883769
## "91",2380.956787,2328.88493879148
## "92",2466.961426,2409.17772388862
## "93",2536.209961,2485.14249202866
## "94",2561.852051,2551.35742823305
## "98",2827.328857,2738.99175847769
## "100",3157.23877,2898.5886647007
## "102",3167.856201,3015.06614417074
## "106",3322.21167,3049.78063242073
## "109",3156.509521,3300.0992895171
## "111",3020.089844,3014.17979598772
## "118",3224.915283,3170.18977183505
## "120",3270.60083,3095.34649219725
## "133",3427.340088,3437.67336176378
## "147",3155.523682,3077.6298509498
## "154",3001.678955,2860.95863738495
## "155",3307.516113,3003.7658490869
## "159",3518.518555,3369.65451780756
## "163",3575.716797,3540.95464790651
## "164",3425.852783,3555.39459251324
## "165",3545.354004,3406.12504084368
## "169",3862.634766,3762.18291699914
## "171",3847.104492,3800.34728754579
## "172",3748.760254,3817.5251255571
## "178",4087.903076,4129.07761583646
## "180",4131.102051,4172.79521580045
## "181",3930.257324,4090.93436591376
## "183",4414.746582,4241.37609346835
## "185",4288.074219,4274.11254921032
## "186",4324.626953,4240.925058137
## "187",4584.798828,4273.87881802207
## "191",4521.581055,4427.81337272547
## "194",4735.068848,4742.53531478254
## "198",4651.460449,4604.74417292752
## "199",4626.358887,4587.35935606972
## "209",4239.981445,4290.73375069301
## "212",4096.912109,4005.21929676855
## "215",4631.479004,4394.28654926663
## "220",4198.322754,4078.87655528093
## "221",4358.737305,4155.16868780537
## "227",4134.453125,4045.40957278891
## "235",3933.844482,3890.54408234582
## "236",4020.26001,3904.73926590222
## "238",4108.015625,3946.54970322589
## "248",3829.564941,3742.1360037568
## "251",3550.386963,3766.13804387614
## "252",3418.408203,3531.06495251353
## "256",3083.0979,3154.13834947831
## "258",3372.258301,3231.88027206721
## "260",3310.001465,3241.68337415162
## "261",3330.530762,3300.5050336875
## "263",3212.304932,3340.01359406254
## "266",3001.120117,3094.25621604201
## "268",2405.181152,2580.84957139389
## "277",2688.278809,2621.33388863883
## "278",2792.117188,2702.68620853615
## "282",3014.648193,2987.72967917911
## "283",3057.476074,3017.05966558488
## "284",3142.470703,3057.87486715801
## "286",3239.457031,3119.16196952856
## "292",3179.877197,2938.98005235376
## "293",3127.830078,3176.12715572934
## "295",2785.727539,2891.50749900645
## "298",2573.816162,2644.7893820302
## "300",2590.359619,2656.11384861225
## "305",2919.201172,2638.54786614947
## "308",2834.468994,2954.77052211098
## "309",2617.156006,2844.06873884967
## "310",2664.831055,2635.95405975029
## "311",2555.037354,2680.6213688079
## "313",2576.747559,2519.8860789292
## "318",2518.94458,2592.71354385514
## "322",2814.854492,2783.49033900097
## "329",3108.062012,3032.53614712857
## "330",3106.671387,3106.92229064769
## "336",3281.642822,3373.02343947467
## "338",3445.059326,3435.02065253681
## "345",3261.91626,3187.20920853105
## "346",3211.866943,3253.83575611963
## "347",2981.052246,3204.12726757784
## "351",3040.916504,3022.03372871869
## "352",3062.310303,3042.20830288249
## "355",3104.106445,3058.0173124201
## "359",2938.114014,2968.84790193421
## "361",3009.393555,2928.65786044861
## "364",2936.940918,2895.84185320806
#rme and rmse
mse <- mean((mod_eval$Actual - mod_eval$Predicted)^2)
mse
## [1] 18205.93
rmse <- sqrt(mse)
rmse
## [1] 134.9294