4063 Final复习资料

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 6

1.

Frank has 3 bags of gold, containing 10 ounces each.


Sally adds 5 ounces to one of his bags, 3 ounces to another, and 12 ounces to the last.
Use vectors to show how much gold Frank has in each bag.
Upload the R code to Assignment 1 using R Markdown in Word (docx) format.
# Gold (in ounces) in each of Frank's three bags before Sally adds any.
Frank <- rep(10, 3)
# Ounces Sally adds to bag 1, bag 2 and bag 3 respectively.
Sally <- c(5, 3, 12)
# Vector addition is element-wise, so this is the new weight of every bag.
Total <- Frank + Sally
Total
2.
# NOTE(review): `Sale` is not created in this snippet; it is assumed to be a
# data frame with `Cost` and `Profit` columns that was loaded beforehand.

# Print out one cell from the dataframe (row 2, column 1).
Sale[2, 1]
# View() opens a data viewer window, so only call it in an interactive
# session; otherwise the script can fail under Rscript or when knitted.
if (interactive()) {
  View(Sale)
}
# Split the dataframe into pieces: one piece per distinct Cost value,
# then one piece per distinct Profit value.
split(Sale, Sale$Cost)
split(Sale, Sale$Profit)
# Add up all the values in one column of the dataframe.
sum(Sale$Cost)
3.
# Create a dataframe with 5 columns of 9 values each.
# Every column is the previous one shifted up by 1.
C1 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
C2 <- C1 + 1
C3 <- C1 + 2
C4 <- C1 + 3
C5 <- C1 + 4
dataframe <- data.frame(C1, C2, C3, C4, C5)
# View() needs an interactive data viewer; skip it under Rscript / knitr.
if (interactive()) {
  View(dataframe)
}
# Boxplot: one box per column.
boxplot(dataframe)
# Scatterplot: plot() on a data frame draws a scatterplot matrix.
plot(dataframe)
# Histogram: hist() needs a numeric vector, not a whole data frame,
# so draw the histogram of a single column.
hist(dataframe$C1)
# Calculate the standard deviation of the data in one column.
standard_deviation_C1 <- sd(C1)
standard_deviation_C1
# Replace one of the datapoints (row 5 of C2) with an outlier.
dataframe[5, 2] <- 50
# Generate a new boxplot showing the outlier.
boxplot(dataframe)
4.
# Describing data
some_numbers <- c(13, 15, 16, 20, 30, 4, 5, 6, 7, 8, 90)
# Adding the vector to itself doubles every value.
some_numbers <- some_numbers + some_numbers
some_numbers

# Mean, median, range and quantiles
mean_some_numbers <- mean(some_numbers)
mean_some_numbers

median_some_numbers <- median(some_numbers)
median_some_numbers

# range() returns c(min, max), not the difference between them.
range_some_numbers <- range(some_numbers)
range_some_numbers

quantile_some_numbers <- quantile(some_numbers)
quantile_some_numbers

# Standard deviation
standard_deviation_some_numbers <- sd(some_numbers)
standard_deviation_some_numbers

different_numbers <- c(1, 3, 4, 5, 6, 7, 7, 7, 3, 8, 10)
summary(some_numbers)
summary(different_numbers)
# View() needs an interactive data viewer; skip it under Rscript / knitr.
if (interactive()) {
  View(some_numbers)
}

# Visualizing
plot(some_numbers)
some_numbers
plot(some_numbers, type = "h", col = "pink", main = "statistics",
     xlab = "Value", ylab = "number")
hist(some_numbers)
barplot(some_numbers)
boxplot(some_numbers)
# Both vectors have 11 elements, so they can be combined column-wise.
some_dataframe <- data.frame(some_numbers, different_numbers)
some_dataframe
plot(some_dataframe, type = "h", col = "pink", main = "statistics",
     xlab = "Value", ylab = "number")
# Overwrite row 10 of the second column (different_numbers) with 50.
some_dataframe[10, 2] <- 50
some_dataframe
5.
# NOTE(review): `Stocks` is not created in this snippet; it is assumed to be a
# data frame with at least 12 columns that was imported beforehand.
mydata <- Stocks
str(mydata)
head(mydata)
# View() needs an interactive data viewer; skip it under Rscript / knitr.
if (interactive()) {
  View(mydata)
}

# Rename the first 12 columns: day, Stock1..Stock10, Rating.
stock_cols <- paste0("Stock", 1:10)
names(mydata)[1:12] <- c("day", stock_cols, "Rating")
names(mydata)[1:12]
str(mydata)

# Convert every stock column to numeric. Going through as.character() first
# means a factor column yields its labels rather than its integer codes.
# Values that cannot be parsed become NA (with a warning).
mydata[stock_cols] <- lapply(
  mydata[stock_cols],
  function(column) as.numeric(as.character(column))
)
mydata$Rating <- as.factor(as.character(mydata$Rating))
str(mydata)

# Check for missing ("NA") values: overall cell counts, then per-row status.
table(is.na(mydata))
complete.cases(mydata)

# Remove NAs by overwriting them with the mean of that column.
# NOTE(review): only Stock1 is imputed here; the other columns may still
# contain NA values.
mydata$Stock1[is.na(mydata$Stock1)] <- mean(mydata$Stock1, na.rm = TRUE)
6.
# naiveBayes() comes from the e1071 package, which must be loaded first.
library(e1071)

# Use three columns of data (weather, time, health) to predict a fourth (lawn).
weather <- c("rainy", "snow", "sunny")
time <- c("urgent", "adequate", "adequate")
health <- c("bad", "good", "good")
lawn <- c("no", "yes", "yes")
dataset <- data.frame(weather, time, health, lawn)
# View() needs an interactive data viewer; skip it under Rscript / knitr.
if (interactive()) {
  View(dataset)
}
str(dataset)

# Make sure every column is a factor.
factor_cols <- c("weather", "time", "health", "lawn")
dataset[factor_cols] <- lapply(
  dataset[factor_cols],
  function(column) as.factor(as.character(column))
)
str(dataset)

# Split into training and test rows (about 70% / 30%).
# The seed makes the random split reproducible.
set.seed(999)
trainIndex <- sample(seq_len(nrow(dataset)), size = floor(0.7 * nrow(dataset)))
train <- dataset[trainIndex, ]
test <- dataset[-trainIndex, ]

# Compare the class balance of the full data with that of the training set.
print(table(dataset$lawn))
print(table(train$lawn))

# Fit the classifier on the training rows only.
# NOTE(review): with just 3 rows the training set has 2 rows, so the fitted
# probabilities are only illustrative.
NBclassfier <- naiveBayes(lawn ~ weather + time + health, data = train)
print(NBclassfier)
7. Naive Bayes
# Packages: caret provides createDataPartition(), e1071 provides naiveBayes().
library(caret)
library(e1071)

# NOTE(review): `CreditRating` is not created in this snippet; it is assumed to
# be a data frame with PurchaseFrequency, CreditRating, Age and Approval
# columns that was imported beforehand.
str(CreditRating)

# Treat every variable as categorical.
factor_cols <- c("PurchaseFrequency", "CreditRating", "Age", "Approval")
CreditRating[factor_cols] <- lapply(
  CreditRating[factor_cols],
  function(column) as.factor(as.character(column))
)
str(CreditRating)

# Divide the data into training (70%) and test (30%) rows.
# The seed makes the random partition reproducible.
set.seed(7267166)
trainIndex <- createDataPartition(CreditRating$Approval, p = 0.7)$Resample1
train <- CreditRating[trainIndex, ]
test <- CreditRating[-trainIndex, ]

# Check the balance of the predicted variable in the full data and in the
# training set.
print(table(CreditRating$Approval))
print(table(train$Approval))

# Predict Approval from the other three variables. The response must not be
# listed among the predictors.
NBclassfier <- naiveBayes(
  Approval ~ PurchaseFrequency + CreditRating + Age,
  data = train
)
print(NBclassfier)
8.
# Load dataset: iris
data("iris")
# View structure of dataset
str(iris)
# View summary of dataset
summary(iris)
# View top rows
head(iris)

# Create new datasets: the four numeric measurements, and the class labels.
iris.new <- iris[, c(1, 2, 3, 4)]
iris.class <- iris[, "Species"]
# View top rows
head(iris.new)
head(iris.class)

# Min-max normalization: rescale a numeric vector to the range [0, 1].
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
# Normalize every measurement column; `[] <-` keeps the data frame structure.
iris.new[] <- lapply(iris.new, normalize)
# View top rows
head(iris.new)

# Apply k-means clustering algorithm with 3 clusters.
# The starting centers are random, so cluster numbering can differ per run.
result <- kmeans(iris.new, 3)
# Number of points in each cluster
result$size
# Coordinates of each cluster center
result$centers
# Cluster assignment of every row
result$cluster

# Verify results of clustering on a 2 x 2 grid of plots.
# par() returns the previous settings so they can be restored afterwards.
old_par <- par(mfrow = c(2, 2), mar = c(5, 4, 2, 2))
# Sepal.Length vs Sepal.Width, coloured by cluster
plot(iris.new[c(1, 2)], col = result$cluster)
# Sepal.Length vs Sepal.Width, coloured by the original "class" attribute
# in the dataset
plot(iris.new[c(1, 2)], col = iris.class)
# Petal.Length vs Petal.Width, coloured by cluster
plot(iris.new[c(3, 4)], col = result$cluster)
# Petal.Length vs Petal.Width, coloured by the original class
plot(iris.new[c(3, 4)], col = iris.class)
par(old_par)

# Cross-tabulate clusters against the true species.
table(result$cluster, iris.class)
9.
# tree() and the plot/text methods for its result come from the tree package.
library(tree)

# NOTE(review): `grades` is not created in this snippet; it is assumed to be a
# data frame with Quiz1..Quiz5 and Level columns that was imported beforehand.

# Explore data
# View() needs an interactive data viewer; skip it under Rscript / knitr.
if (interactive()) {
  View(grades)
}
str(grades)
table(grades$Level)

# Clean data: the response must be a factor for a classification tree.
grades$Level <- as.factor(as.character(grades$Level))
str(grades)

# Randomly label each row 1 (train, about 70%) or 2 (test, about 30%).
# The seed makes the random split reproducible.
set.seed(2134)
ind <- sample(2, nrow(grades), replace = TRUE, prob = c(0.7, 0.3))
train_set <- grades[ind == 1, ]
test_set <- grades[ind == 2, ]
nrow(train_set)
nrow(test_set)

# Build the decision tree on the training rows.
mytree <- tree(Level ~ Quiz1 + Quiz2 + Quiz3 + Quiz4 + Quiz5, data = train_set)
# Summarize the model
summary(mytree)
# Plot the tree and label its nodes.
plot(mytree)
text(mytree, pretty = 0, cex = 0.6)
10.
# Import the data1.csv file into RStudio.
# Summarize the data.
# Remove the last column and create a boxplot from the remaining columns.
# Create a scatterplot of column 1 and column 3.
# Calculate the correlation between column 1 and 3.

# The file is read relative to the current working directory.
data1 <- read.csv("data1.csv")
summary(data1)

# Drop the last column, whatever its position, instead of hard-coding index 6.
data2 <- data1[, -ncol(data1)]
boxplot(data2)

# Scatterplot of column 1 against column 3 only.
plot(data2[[1]], data2[[3]],
     xlab = names(data2)[1], ylab = names(data2)[3])

# Correlation between column 1 and column 3.
# NOTE(review): assumes both columns are numeric.
cor(data2[[1]], data2[[3]])

You might also like