library(caTools)
library(corrgram)
## Warning: package 'corrgram' was built under R version 4.1.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data <- read.csv('C:\\Users\\Bobby\\OneDrive\\Documents\\School\\GISC 6323 - Knowledge Mining\\Final Project\\ETH-USD.csv')
head(data)
## Date Open High Low Close Adj.Close Volume
## 1 2021-04-30 2757.734 2796.055 2728.170 2773.207 2773.207 29777179889
## 2 2021-05-01 2772.838 2951.441 2755.908 2945.893 2945.893 28726205272
## 3 2021-05-02 2945.560 2984.892 2860.526 2952.056 2952.056 28032013047
## 4 2021-05-03 2951.176 3450.038 2951.176 3431.086 3431.086 49174290212
## 5 2021-05-04 3431.132 3523.586 3180.743 3253.629 3253.629 62402045158
## 6 2021-05-05 3240.555 3541.463 3213.102 3522.783 3522.783 48334198383
glimpse(data)
## Rows: 366
## Columns: 7
## $ Date <chr> "2021-04-30", "2021-05-01", "2021-05-02", "2021-05-03", "202~
## $ Open <dbl> 2757.734, 2772.838, 2945.560, 2951.176, 3431.132, 3240.555, ~
## $ High <dbl> 2796.055, 2951.441, 2984.892, 3450.038, 3523.586, 3541.463, ~
## $ Low <dbl> 2728.170, 2755.908, 2860.526, 2951.176, 3180.743, 3213.102, ~
## $ Close <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Adj.Close <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Volume <dbl> 29777179889, 28726205272, 28032013047, 49174290212, 62402045~
data$Date = as.Date(data$Date,format = '%Y-%m-%d')
glimpse(data)
## Rows: 366
## Columns: 7
## $ Date <date> 2021-04-30, 2021-05-01, 2021-05-02, 2021-05-03, 2021-05-04,~
## $ Open <dbl> 2757.734, 2772.838, 2945.560, 2951.176, 3431.132, 3240.555, ~
## $ High <dbl> 2796.055, 2951.441, 2984.892, 3450.038, 3523.586, 3541.463, ~
## $ Low <dbl> 2728.170, 2755.908, 2860.526, 2951.176, 3180.743, 3213.102, ~
## $ Close <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Adj.Close <dbl> 2773.207, 2945.893, 2952.056, 3431.086, 3253.629, 3522.783, ~
## $ Volume <dbl> 29777179889, 28726205272, 28032013047, 49174290212, 62402045~
corrgram(data, lower.panel=panel.shade, upper.panel=panel.cor)

open.close <- melt(data,id.vars='Date', measure.vars=c('Open', 'Close'))
bplot <- ggplot(open.close) +
geom_boxplot(aes(x=, y=value, color=variable))
bplot

mean(data$Close)
## [1] 3186.631
mean(data$Open)
## [1] 3187.535
summary(data$Open)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1786 2636 3135 3188 3740 4810
summary(data$Close)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1788 2635 3135 3187 3738 4812
ggplot(data = data) +
geom_histogram(mapping = aes(x = Volume))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data) +
geom_hex(mapping = aes(x = Volume, y = Close))

set.seed(50)
Split <- sample.split(Y=data$Close + data$Open, SplitRatio=0.7)
training <- subset(x=data, Split==TRUE)
testing <- subset(x=data, Split==FALSE)
model <- lm(formula=Close ~ Open, data=training)
summary(model)
##
## Call:
## lm(formula = Close ~ Open, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -909.81 -88.48 -3.38 81.88 506.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 119.05619 46.99141 2.534 0.0119 *
## Open 0.96121 0.01431 67.163 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 163.3 on 254 degrees of freedom
## Multiple R-squared: 0.9467, Adjusted R-squared: 0.9465
## F-statistic: 4511 on 1 and 254 DF, p-value: < 2.2e-16
resid <- as.data.frame(residuals(model))
ggplot(resid, aes(residuals(model))) +
geom_histogram(fill='seagreen', color='orange')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

predicts <- predict(model, testing)
predicts
## 1 4 7 16 17 23 24 38
## 2769.812 2955.750 3507.248 4036.894 3619.613 2460.573 2328.265 2646.792
## 42 46 47 49 50 51 52 57
## 2628.907 2530.507 2606.434 2394.530 2401.528 2267.509 2206.164 2031.106
## 67 70 72 73 75 77 78 80
## 2350.907 2347.088 2182.770 2148.047 2074.248 2036.389 1961.306 1945.538
## 88 91 92 93 94 98 100 102
## 2225.361 2328.885 2409.178 2485.142 2551.357 2738.992 2898.589 3015.066
## 106 109 111 118 120 133 147 154
## 3049.781 3300.099 3014.180 3170.190 3095.346 3437.673 3077.630 2860.959
## 155 159 163 164 165 169 171 172
## 3003.766 3369.655 3540.955 3555.395 3406.125 3762.183 3800.347 3817.525
## 178 180 181 183 185 186 187 191
## 4129.078 4172.795 4090.934 4241.376 4274.113 4240.925 4273.879 4427.813
## 194 198 199 209 212 215 220 221
## 4742.535 4604.744 4587.359 4290.734 4005.219 4394.287 4078.877 4155.169
## 227 235 236 238 248 251 252 256
## 4045.410 3890.544 3904.739 3946.550 3742.136 3766.138 3531.065 3154.138
## 258 260 261 263 266 268 277 278
## 3231.880 3241.683 3300.505 3340.014 3094.256 2580.850 2621.334 2702.686
## 282 283 284 286 292 293 295 298
## 2987.730 3017.060 3057.875 3119.162 2938.980 3176.127 2891.507 2644.789
## 300 305 308 309 310 311 313 318
## 2656.114 2638.548 2954.771 2844.069 2635.954 2680.621 2519.886 2592.714
## 322 329 330 336 338 345 346 347
## 2783.490 3032.536 3106.922 3373.023 3435.021 3187.209 3253.836 3204.127
## 351 352 355 359 361 364
## 3022.034 3042.208 3058.017 2968.848 2928.658 2895.842
mod_eval <- cbind(testing$Close, predicts)
colnames(mod_eval) <- c('Actual', 'Predicted')
mod_eval <- as.data.frame(mod_eval)
head(mod_eval)
## Actual Predicted
## 1 2773.207 2769.812
## 4 3431.086 2955.750
## 7 3490.880 3507.248
## 16 3638.122 4036.894
## 17 3587.506 3619.613
## 23 2295.706 2460.573
mod_eval2 <- cbind(testing$Date, mod_eval)
colnames(mod_eval2) <- c('Date','Actual', 'Predicted')
head(mod_eval2)
## Date Actual Predicted
## 1 2021-04-30 2773.207 2769.812
## 4 2021-05-03 3431.086 2955.750
## 7 2021-05-06 3490.880 3507.248
## 16 2021-05-15 3638.122 4036.894
## 17 2021-05-16 3587.506 3619.613
## 23 2021-05-22 2295.706 2460.573
modeval_Melted <- reshape2::melt(mod_eval2, id.var='Date')
head(modeval_Melted)
## Date variable value
## 1 2021-04-30 Actual 2773.207
## 2 2021-05-03 Actual 3431.086
## 3 2021-05-06 Actual 3490.880
## 4 2021-05-15 Actual 3638.122
## 5 2021-05-16 Actual 3587.506
## 6 2021-05-22 Actual 2295.706
lineplot <- ggplot(modeval_Melted, aes(x=Date, y=value, col=variable)) + geom_line(aes(group=1), size=1)
lineplot

write.csv(mod_eval)
## "","Actual","Predicted"
## "1",2773.207031,2769.81236802604
## "4",3431.086182,2955.75003442064
## "7",3490.880371,3507.24806701246
## "16",3638.12207,4036.89363590559
## "17",3587.506104,3619.61319192519
## "23",2295.705566,2460.57301909215
## "24",2109.579834,2328.26517588701
## "38",2715.092773,2646.79182057118
## "42",2471.518555,2628.90739085086
## "46",2537.891113,2530.50653560699
## "47",2610.936768,2606.43445968221
## "49",2372.001953,2394.53009586992
## "50",2231.733154,2401.52818819939
## "51",2178.499023,2267.50913133833
## "52",2246.364502,2206.16359617072
## "57",1813.217285,2031.1063999532
## "67",2198.58252,2350.90706916354
## "70",2120.026367,2347.0875812346
## "72",2111.403564,2182.76958548948
## "73",2139.664795,2148.04688371692
## "75",1940.083984,2074.24788546914
## "77",1911.175659,2036.38940688082
## "78",1880.382935,1961.30594250409
## "80",1895.552124,1945.53823511412
## "88",2233.366699,2225.36076883769
## "91",2380.956787,2328.88493879148
## "92",2466.961426,2409.17772388862
## "93",2536.209961,2485.14249202866
## "94",2561.852051,2551.35742823305
## "98",2827.328857,2738.99175847769
## "100",3157.23877,2898.5886647007
## "102",3167.856201,3015.06614417074
## "106",3322.21167,3049.78063242073
## "109",3156.509521,3300.0992895171
## "111",3020.089844,3014.17979598772
## "118",3224.915283,3170.18977183505
## "120",3270.60083,3095.34649219725
## "133",3427.340088,3437.67336176378
## "147",3155.523682,3077.6298509498
## "154",3001.678955,2860.95863738495
## "155",3307.516113,3003.7658490869
## "159",3518.518555,3369.65451780756
## "163",3575.716797,3540.95464790651
## "164",3425.852783,3555.39459251324
## "165",3545.354004,3406.12504084368
## "169",3862.634766,3762.18291699914
## "171",3847.104492,3800.34728754579
## "172",3748.760254,3817.5251255571
## "178",4087.903076,4129.07761583646
## "180",4131.102051,4172.79521580045
## "181",3930.257324,4090.93436591376
## "183",4414.746582,4241.37609346835
## "185",4288.074219,4274.11254921032
## "186",4324.626953,4240.925058137
## "187",4584.798828,4273.87881802207
## "191",4521.581055,4427.81337272547
## "194",4735.068848,4742.53531478254
## "198",4651.460449,4604.74417292752
## "199",4626.358887,4587.35935606972
## "209",4239.981445,4290.73375069301
## "212",4096.912109,4005.21929676855
## "215",4631.479004,4394.28654926663
## "220",4198.322754,4078.87655528093
## "221",4358.737305,4155.16868780537
## "227",4134.453125,4045.40957278891
## "235",3933.844482,3890.54408234582
## "236",4020.26001,3904.73926590222
## "238",4108.015625,3946.54970322589
## "248",3829.564941,3742.1360037568
## "251",3550.386963,3766.13804387614
## "252",3418.408203,3531.06495251353
## "256",3083.0979,3154.13834947831
## "258",3372.258301,3231.88027206721
## "260",3310.001465,3241.68337415162
## "261",3330.530762,3300.5050336875
## "263",3212.304932,3340.01359406254
## "266",3001.120117,3094.25621604201
## "268",2405.181152,2580.84957139389
## "277",2688.278809,2621.33388863883
## "278",2792.117188,2702.68620853615
## "282",3014.648193,2987.72967917911
## "283",3057.476074,3017.05966558488
## "284",3142.470703,3057.87486715801
## "286",3239.457031,3119.16196952856
## "292",3179.877197,2938.98005235376
## "293",3127.830078,3176.12715572934
## "295",2785.727539,2891.50749900645
## "298",2573.816162,2644.7893820302
## "300",2590.359619,2656.11384861225
## "305",2919.201172,2638.54786614947
## "308",2834.468994,2954.77052211098
## "309",2617.156006,2844.06873884967
## "310",2664.831055,2635.95405975029
## "311",2555.037354,2680.6213688079
## "313",2576.747559,2519.8860789292
## "318",2518.94458,2592.71354385514
## "322",2814.854492,2783.49033900097
## "329",3108.062012,3032.53614712857
## "330",3106.671387,3106.92229064769
## "336",3281.642822,3373.02343947467
## "338",3445.059326,3435.02065253681
## "345",3261.91626,3187.20920853105
## "346",3211.866943,3253.83575611963
## "347",2981.052246,3204.12726757784
## "351",3040.916504,3022.03372871869
## "352",3062.310303,3042.20830288249
## "355",3104.106445,3058.0173124201
## "359",2938.114014,2968.84790193421
## "361",3009.393555,2928.65786044861
## "364",2936.940918,2895.84185320806
mse <- mean((mod_eval$Actual - mod_eval$Predicted)^2)
mse
## [1] 18205.93
rmse <- sqrt(mse)
rmse
## [1] 134.9294