All "File - CSV" 'Texttype' 'String'

Download as rtf, pdf, or txt
Download as rtf, pdf, or txt
You are on page 1of 4

clc
clear all

filename = "file.csv";
data = readtable(filename,'TextType','string');
head(data)

data.event_type = categorical(data.event_type);

% f = figure;
% f.Position(3) = 1.5*f.Position(3);
figure
h = histogram(data.event_type);
xlabel("Class")
ylabel("Frequency")
title("Class Distribution")
%% Get the frequency counts of the classes and their names from the histogram.
classCounts = h.BinCounts;
classNames = h.Categories;

% Find the classes containing fewer than ten observations.
idxLowCounts = classCounts < 10;
infrequentClasses = classNames(idxLowCounts)
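
% Classes with very few observations can make the stratified holdout split
% unreliable. Removing them here is an added step, not in the original script,
% but it is the usual follow-up once infrequentClasses has been identified.
data(ismember(data.event_type,infrequentClasses),:) = [];
data.event_type = removecats(data.event_type);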
%% Partition Data
% Partition the data into a training partition and a held-out partition for
% validation and testing. Specify the holdout percentage to be 20%.
cvp = cvpartition(data.event_type,'Holdout',0.2);
dataTrain = data(training(cvp),:);
dataValidation = data(test(cvp),:);
%% Extract the text data and labels from the partitioned tables.
textDataTrain = dataTrain.event_narrative;
textDataValidation = dataValidation.event_narrative;
YTrain = dataTrain.event_type;
YValidation = dataValidation.event_type;
%% Visualize the training text data using a word cloud.
figure
wordcloud(textDataTrain);
title("Training Data")
%% Preprocessing
textDataTrain = erasePunctuation(textDataTrain);
textDataTrain = lower(textDataTrain);
documentsTrain = tokenizedDocument(textDataTrain);
% View the first few preprocessed training documents.
documentsTrain(1:5)
%% Word Embedding
embeddingDimension = 100;
embeddingEpochs = 50;
emb = trainWordEmbedding(documentsTrain, ...
    'Dimension',embeddingDimension, ...
    'NumEpochs',embeddingEpochs, ...
    'Verbose',0)

%% The trainingOptions function provides options to pad and truncate input sequences automatically
% (see the sketch after the training options below).
% Visualize the document lengths to help choose a sequence length.
documentLengths = doclength(documentsTrain);
figure
histogram(documentLengths)
title("Document Lengths")
xlabel("Length")
ylabel("Number of Documents")
%% Convert the documents to sequences of word vectors using doc2sequence.
% Truncate each document to at most sequenceLength words before converting.
sequenceLength = 75;
documentsTruncatedTrain = docfun(@(words) words(1:min(sequenceLength,end)),documentsTrain);
XTrain = doc2sequence(emb,documentsTruncatedTrain);
XTrain(1:5)
%% Convert the validation documents to sequences using the same options.
textDataValidation = erasePunctuation(textDataValidation);
textDataValidation = lower(textDataValidation);
documentsValidation = tokenizedDocument(textDataValidation);
documentsTruncatedValidation = docfun(@(words) words(1:min(sequenceLength,end)),documentsValidation);
XValidation = doc2sequence(emb,documentsTruncatedValidation);

%% Create and Train LSTM Network
inputSize = embeddingDimension;
outputSize = 180;
numClasses = numel(categories(YTrain));

layers = [ ...
    sequenceInputLayer(inputSize)
    lstmLayer(outputSize,'OutputMode','last')
    fullyConnectedLayer(numClasses)
    softmaxLayer
    classificationLayer]

%% Specify Training Options
options = trainingOptions('adam', ...
    'GradientThreshold',1, ...
    'InitialLearnRate',0.01, ...
    'Plots','training-progress', ...
    'Verbose',0);
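
% The options above rely on trainNetwork's default handling of variable-length
% sequences (padding within each mini-batch). As noted earlier, padding and
% truncation can also be controlled through trainingOptions; the sketch below is
% illustrative only (optionsWithPadding is a hypothetical name and is not used
% later in this script), assuming a release that supports these sequence options.
optionsWithPadding = trainingOptions('adam', ...
    'GradientThreshold',1, ...
    'InitialLearnRate',0.01, ...
    'SequenceLength',sequenceLength, ...
    'SequencePaddingDirection','left', ...
    'Verbose',0);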
%% Train the LSTM network using the trainNetwork function.
net = trainNetwork(XTrain,YTrain,layers,options);

%% Test LSTM Network
% The held-out partition serves as the test set.
textDataTest = dataValidation.event_narrative;
YTest = dataValidation.event_type;
textDataTest = erasePunctuation(textDataTest);
textDataTest = lower(textDataTest);
documentsTest = tokenizedDocument(textDataTest);

%% Convert the test documents to sequences, then left-pad each one to sequenceLength
% (leftPad is a local helper defined at the end of this script).
documentsTruncatedTest = docfun(@(words) words(1:min(sequenceLength,end)),documentsTest);
XTest = doc2sequence(emb,documentsTruncatedTest);
for i = 1:numel(XTest)
    XTest{i} = leftPad(XTest{i},sequenceLength);
end
XTest(1:5)

%% Classify the test documents using the trained LSTM network.
YPred = classify(net,XTest);

%% Calculate the classification accuracy.
accuracy = sum(YPred == YTest)/numel(YPred)
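
% A per-class breakdown can make the accuracy number easier to interpret. This
% confusion chart is an added illustration, not part of the original script, and
% assumes a release that provides confusionchart (R2018b or later).
figure
cm = confusionchart(YTest,YPred);
cm.Title = "Test Set Confusion Matrix";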
%% Predict Using New Data
reportsNew = [ ...
    "Lots of water damage to computer equipment inside the office."
    "A large tree is downed and blocking traffic outside Apple Hill."
    "Damage to many car windshields in parking lot."
    ];

% Preprocess the text data using the same preprocessing steps as the training documents.
reportsNew = lower(reportsNew);
reportsNew = erasePunctuation(reportsNew);
documentsNew = tokenizedDocument(reportsNew);
%% Convert the text data to sequences using doc2sequence with the same options as when creating the training sequences.
documentsTruncatedNew = docfun(@(words) words(1:min(sequenceLength,end)),documentsNew);
XNew = doc2sequence(emb,documentsTruncatedNew);
for i = 1:numel(XNew)
    XNew{i} = leftPad(XNew{i},sequenceLength);
end
% Classify the new sequences using the trained LSTM network.
[labelsNew,score] = classify(net,XNew);

% Show the weather reports with their predicted labels.
[reportsNew string(labelsNew)]
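
%% Local helper: leftPad
% leftPad is called above but is not a toolbox function and is never defined in
% this script. The implementation below is an assumption, modeled on the
% left-padding helper used in the MathWorks example this script appears to
% follow: it pads a word-vector sequence M with zero columns on the left so
% that it has exactly N columns.
function MPadded = leftPad(M,N)
    [dimension,curLength] = size(M);
    paddingLength = N - curLength;
    MPadded = [zeros(dimension,paddingLength) M];
end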