Skip to content

Commit 4fecd06

Browse files
committed
dotnet#83 Move Pipeline module to separate project. Update README file.
1 parent 67c3b38 commit 4fecd06

File tree

8 files changed

+144
-93
lines changed

8 files changed

+144
-93
lines changed

samples/fsharp/getting-started/GettingStarted.sln

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Clustering_Iris", "Clusteri
1111
EndProject
1212
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "BinaryClassification_SentimentAnalysis", "BinaryClassification_SentimentAnalysis\BinaryClassification_SentimentAnalysis.fsproj", "{7AE870E2-3C28-45F6-840B-FBCC3234FFE4}"
1313
EndProject
14+
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "Microsoft.ML.Core.FSharp", "Microsoft.ML.Core.FSharp\Microsoft.ML.Core.FSharp.fsproj", "{FE227631-1E97-4D25-BAB1-96097C90E9C7}"
15+
EndProject
1416
Global
1517
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1618
Debug|Any CPU = Debug|Any CPU
@@ -33,6 +35,10 @@ Global
3335
{7AE870E2-3C28-45F6-840B-FBCC3234FFE4}.Debug|Any CPU.Build.0 = Debug|Any CPU
3436
{7AE870E2-3C28-45F6-840B-FBCC3234FFE4}.Release|Any CPU.ActiveCfg = Release|Any CPU
3537
{7AE870E2-3C28-45F6-840B-FBCC3234FFE4}.Release|Any CPU.Build.0 = Release|Any CPU
38+
{FE227631-1E97-4D25-BAB1-96097C90E9C7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
39+
{FE227631-1E97-4D25-BAB1-96097C90E9C7}.Debug|Any CPU.Build.0 = Debug|Any CPU
40+
{FE227631-1E97-4D25-BAB1-96097C90E9C7}.Release|Any CPU.ActiveCfg = Release|Any CPU
41+
{FE227631-1E97-4D25-BAB1-96097C90E9C7}.Release|Any CPU.Build.0 = Release|Any CPU
3642
EndGlobalSection
3743
GlobalSection(SolutionProperties) = preSolution
3844
HideSolutionNode = FALSE
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFramework>netstandard2.0</TargetFramework>
5+
</PropertyGroup>
6+
7+
<ItemGroup>
8+
<Compile Include="Pipeline.fs" />
9+
</ItemGroup>
10+
11+
<ItemGroup>
12+
<PackageReference Include="Microsoft.ML" Version="$(MicrosoftMLVersion)" />
13+
</ItemGroup>
14+
15+
</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
namespace Microsoft.ML.Core.FSharp
2+
3+
/// Small curried wrappers around ML.NET estimator constructors and methods.
/// Each function takes the value it operates on (environment or pipeline) as
/// its last argument, so calls can be chained with the `|>` operator.
module Pipeline =
4+
open Microsoft.ML.Core.Data
5+
open Microsoft.ML.Runtime.Data
6+
7+
8+
/// Constructs a `TextTransform`, passing `env`, `inputColumn` and
/// `outputColumn` to the constructor in that order.
let textTransform (inputColumn : string) outputColumn env =
9+
TextTransform(env, inputColumn, outputColumn)
10+
11+
/// Constructs a `ConcatEstimator`, passing `env`, `name` and `source`
/// to the constructor in that order.
let concatEstimator name source env =
12+
ConcatEstimator(env,name, source)
13+
14+
/// Appends `estimator` to `pipeline` by calling `Append` on the pipeline.
/// The pipeline is type-tested at runtime against `IEstimator<ITransformer>`;
/// when the test does not match, `failwith` raises with the message below.
let append (estimator : IEstimator<'a>) (pipeline : IEstimator<'b>) =
15+
match pipeline with
16+
| :? IEstimator<ITransformer> as p ->
17+
p.Append estimator
18+
| _ -> failwith "The pipeline has to be an instance of IEstimator<ITransformer>."
19+
20+
/// Calls `Fit` on the estimator chain with `dataView` and returns its result.
/// The pipeline comes last so it can be supplied through `|>`.
let fit (dataView : IDataView) (pipeline : EstimatorChain<'a>) =
21+
pipeline.Fit dataView

samples/fsharp/getting-started/Regression_TaxiFarePrediction/Program.fs

+9-32
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,17 @@
22

33
open System
44
open System.IO
5+
open System.Diagnostics
56

67
open Microsoft.ML.Runtime.Learners
78
open Microsoft.ML.Runtime.Data
89
open Microsoft.ML
910
open Microsoft.ML.Core.Data
1011
open Microsoft.ML.Runtime.Api
1112
open Microsoft.ML.Legacy
13+
open Microsoft.ML.Core.FSharp
14+
1215
open PLplot
13-
open System.Diagnostics
1416

1517

1618
let AppPath = Path.Combine(__SOURCE_DIRECTORY__, "../../../..")
@@ -37,31 +39,6 @@ type TaxiTripFarePrediction = {
3739
}
3840

3941

40-
module Pipeline =
41-
open Microsoft.ML.Core.Data
42-
43-
let private downcastEstimator (b : IEstimator<'a>) =
44-
match b with
45-
| :? IEstimator<ITransformer> as b -> b
46-
| _ -> failwith "qwe"
47-
48-
49-
let textTransform (inputColumn : string) outputColumn env =
50-
TextTransform(env, inputColumn, outputColumn)
51-
52-
let concatEstimator name source env =
53-
ConcatEstimator(env,name, source)
54-
55-
let append (estimator : IEstimator<'a>) (pipeline : IEstimator<'b>) =
56-
match pipeline with
57-
| :? IEstimator<ITransformer> as p ->
58-
p.Append estimator
59-
| _ -> failwith "The pipeline has to be an instance of IEstimator<ITransformer>."
60-
61-
let fit (dataView : IDataView) (pipeline : EstimatorChain<'a>) =
62-
pipeline.Fit dataView
63-
64-
6542
let createTaxiFareDataFileLoader mlcontext =
6643
TextLoader(
6744
mlcontext,
@@ -102,7 +79,7 @@ let buildAndTrain mlcontext =
10279
|> Pipeline.append(new Normalizer(mlcontext, "PassengerCount", Normalizer.NormalizerMode.MeanVariance))
10380
|> Pipeline.append(new Normalizer(mlcontext, "TripTime", Normalizer.NormalizerMode.MeanVariance))
10481
|> Pipeline.append(new Normalizer(mlcontext, "TripDistance", Normalizer.NormalizerMode.MeanVariance))
105-
|> Pipeline.append(new ConcatEstimator(mlcontext, "Features", "VendorId", "RateCode", "PassengerCount", "TripTime", "TripDistance", "PaymentType"));
82+
|> Pipeline.append(new ConcatEstimator(mlcontext, "Features", "VendorId", "RateCode", "PassengerCount", "TripTime", "TripDistance", "PaymentType"))
10683

10784
// We apply our selected Trainer (SDCA Regression algorithm)
10885
let pipelineWithTrainer =
@@ -212,19 +189,19 @@ let plotRegressionChart (model : ITransformer) testDataSetPath numberOfRecordsTo
212189
let xMaxLimit = 40. //Rides larger than $40 are not shown in the chart
213190
let yMinLimit = 0.
214191
let yMaxLimit = 40. //Rides larger than $40 are not shown in the chart
215-
pl.env(xMinLimit, xMaxLimit, yMinLimit, yMaxLimit, AxesScale.Independent, AxisBox.BoxTicksLabelsAxes);
192+
pl.env(xMinLimit, xMaxLimit, yMinLimit, yMaxLimit, AxesScale.Independent, AxisBox.BoxTicksLabelsAxes)
216193

217194
// Set scaling for main title text 125% size of default
218195
pl.schr(0., 1.25)
219196

220197
// The main title
221-
pl.lab("Measured", "Predicted", "Distribution of Taxi Fare Prediction");
198+
pl.lab("Measured", "Predicted", "Distribution of Taxi Fare Prediction")
222199

223200
// plot using different colors
224201
// see http://plplot.sourceforge.net/examples.php?demo=02 for palette indices
225202
pl.col0 1
226203

227-
let totalNumber = numberOfRecordsToRead;
204+
let totalNumber = numberOfRecordsToRead
228205
let testData = getDataFromCsv testDataSetPath totalNumber
229206

230207
//This code is the symbol to paint
@@ -249,7 +226,7 @@ let plotRegressionChart (model : ITransformer) testDataSetPath numberOfRecordsTo
249226
let y = [| float farePrediction.FareAmount |]
250227

251228
//Paint a dot
252-
pl.poin(x, y, code);
229+
pl.poin(x, y, code)
253230

254231
xTotal <- xTotal + float x.[0]
255232
yTotal <- yTotal + float y.[0]
@@ -280,7 +257,7 @@ let plotRegressionChart (model : ITransformer) testDataSetPath numberOfRecordsTo
280257
let b = minY - (m * minX)
281258

282259
//Generic function for Y for the regression line
283-
// y = (m * x) + b;
260+
// y = (m * x) + b
284261

285262
let x1 = 1.
286263
//Function for Y1 in the line

samples/fsharp/getting-started/Regression_TaxiFarePrediction/README.md

+88-61
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,8 @@
22

33
| ML.NET version | API type | Status | App Type | Data type | Scenario | ML Task | Algorithms |
44
|----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------|
5-
| v0.6 | LearningPipeline API | Needs update to Dynamic API: [Contribute](/CONTRIBUTING.md) | Console app | .csv files | Price prediction | Regression | Sdca Regression |
5+
| v0.6 | Dynamic API | Up-to-date | Console app | .csv files | Price prediction | Regression | Sdca Regression |
66

7-
------------------------------------
8-
9-
**Important**: This F# sample needs to be updated to the new dynamic API available since ML.NET 0.6. It currently uses the deprecated LearningPipeline API.
10-
11-
Contribution from the community will be welcomed!
12-
13-
------------------------------------
147

158
In this introductory sample, you'll see how to use [ML.NET](https://www.microsoft.com/net/learn/apps/machine-learning-and-ai/ml-dotnet) to predict taxi fares. In the world of machine learning, this type of prediction is known as **regression**.
169

@@ -38,60 +31,83 @@ The common feature for all those examples is that the parameter we want to predi
3831
## Solution
3932
To solve this problem, first we will build an ML model. Then we will train the model on existing data, evaluate how good it is, and lastly we'll consume the model to predict taxi fares.
4033

41-
![Build -> Train -> Evaluate -> Consume](../../../../../master/samples/csharp/getting-started/shared_content/modelpipeline.png)
34+
![Build -> Train -> Evaluate -> Consume](../shared_content/modelpipeline.png)
4235

4336
### 1. Build model
4437

45-
Building a model includes: uploading data (`taxi-fare-train.csv` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `ColumnCopier`,`CategoricalOneHotVectorizer`,`ColumnConcatenator`), and choosing a learning algorithm (`FastTreeRegressor`). All of those steps are stored in a `LearningPipeline`:
38+
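Building a model includes: uploading data (`taxi-fare-train.csv` with `TextLoader`), transforming the data so it can be used effectively by an ML algorithm (with `CopyColumnsEstimator`, `CategoricalEstimator`, `Normalizer`, `ConcatEstimator`), and choosing a learning algorithm (`SdcaRegressionTrainer` in this case):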
39+
4640
```fsharp
4741
// Create the ML.NET environment that is passed to the loader, the transforms and the trainer below.
48-
let pipeline = LearningPipeline()
49-
50-
// The TextLoader loads a dataset. The schema of the dataset is specified by passing a class containing
51-
// all the column names and their types. This will be used to create the model, and train it.
52-
pipeline.Add(TextLoader(TrainDataPath).CreateFrom<TaxiTrip>(separator=',')
53-
// Transforms
54-
// When ML model starts training, it looks for two columns: Label and Features.
55-
// Label: values that should be predicted. If you have a field named Label in your data type,
56-
// no extra actions required.
57-
// If you don’t have it, like in this example, copy the column you want to predict with
58-
// ColumnCopier transform:
59-
pipeline.Add(ColumnCopier(("FareAmount", "Label")))
60-
61-
// CategoricalOneHotVectorizer transforms categorical (string) values into 0/1 vectors
62-
pipeline.Add(CategoricalOneHotVectorizer("VendorId",
63-
"RateCode",
64-
"PaymentType"))
65-
66-
// Features: all data used for prediction. At the end of all transforms you need to concatenate
67-
// all columns except the one you want to predict into Features column with
68-
// ColumnConcatenator transform:
69-
pipeline.Add(ColumnConcatenator("Features",
70-
"VendorId",
71-
"RateCode",
72-
"PassengerCount",
73-
"TripDistance",
74-
"PaymentType"))
75-
76-
//FastTreeRegressor is an algorithm that will be used to train the model.
77-
pipeline.Add(FastTreeRegressor())
42+
let mlcontext = new LocalEnvironment()
43+
44+
// Create the TextLoader by defining the data columns and where to find (column position) them in the text file.
45+
let textLoader = TextLoader(
46+
mlcontext,
47+
TextLoader.Arguments(
48+
Separator = ",",
49+
HasHeader = true,
50+
Column =
51+
[|
52+
TextLoader.Column("VendorId", Nullable DataKind.Text, 0)
53+
TextLoader.Column("RateCode", Nullable DataKind.Text, 1)
54+
TextLoader.Column("PassengerCount", Nullable DataKind.R4, 2)
55+
TextLoader.Column("TripTime", Nullable DataKind.R4, 3)
56+
TextLoader.Column("TripDistance", Nullable DataKind.R4, 4)
57+
TextLoader.Column("PaymentType", Nullable DataKind.Text, 5)
58+
TextLoader.Column("FareAmount", Nullable DataKind.R4, 6)
59+
|]
60+
)
61+
)
62+
63+
// Now read the file (remember though, readers are lazy, so the actual reading will happen when 'fitting').
64+
let dataView = MultiFileSource(TrainDataPath) |> textLoader.Read
65+
66+
//Copy the FareAmount column to the Label column
67+
let pipeline =
68+
CopyColumnsEstimator(mlcontext, "FareAmount", "Label")
69+
|> Pipeline.append(new CategoricalEstimator(mlcontext, "VendorId"))
70+
|> Pipeline.append(new CategoricalEstimator(mlcontext, "RateCode"))
71+
|> Pipeline.append(new CategoricalEstimator(mlcontext, "PaymentType"))
72+
|> Pipeline.append(new Normalizer(mlcontext, "PassengerCount", Normalizer.NormalizerMode.MeanVariance))
73+
|> Pipeline.append(new Normalizer(mlcontext, "TripTime", Normalizer.NormalizerMode.MeanVariance))
74+
|> Pipeline.append(new Normalizer(mlcontext, "TripDistance", Normalizer.NormalizerMode.MeanVariance))
75+
|> Pipeline.append(new ConcatEstimator(mlcontext, "Features", "VendorId", "RateCode", "PassengerCount", "TripTime", "TripDistance", "PaymentType"))
76+
77+
// We apply our selected Trainer (SDCA Regression algorithm)
78+
let pipelineWithTrainer =
79+
pipeline
80+
|> Pipeline.append(new SdcaRegressionTrainer(mlcontext, new SdcaRegressionTrainer.Arguments(), "Features", "Label"))
7881
```
7982

8083
### 2. Train model
81-
Training the model is a process of running the chosen algorithm on a training data (with known fare values) to tune the parameters of the model. It is implemented in the `Train()` API. To perform training we just call the method and provide the types for our data object `TaxiTrip` and prediction object `TaxiTripFarePrediction`.
84+
Training the model is the process of running the chosen algorithm on training data (with known fare values) to tune the parameters of the model. It is implemented in the `Fit()` API. To perform training we just call the method and pass it the DataView.
8285

8386
```fsharp
84-
let model = pipeline.Train<TaxiTrip, TaxiTripFarePrediction>()
87+
let model = pipelineWithTrainer.Fit dataView
8588
```
8689

8790
### 3. Evaluate model
8891
We need this step to determine how accurately our model performs on new data. To do so, the model from the previous step is run against another dataset that was not used in training (`taxi-fare-test.csv`). This dataset also contains known fares. `RegressionContext.Evaluate` compares the known fares with the values predicted by the model and reports the difference as several metrics.
8992

9093
```fsharp
91-
let testData = TextLoader(TestDataPath).CreateFrom<TaxiTrip>(separator=',')
92-
93-
let evaluator = RegressionEvaluator()
94-
let metrics = evaluator.Evaluate(model, testData)
94+
let testDataView = MultiFileSource testDataLocation |> textLoader.Read
95+
96+
printfn "=============== Evaluating Model's accuracy with Test data==============="
97+
98+
let predictions = model.Transform testDataView
99+
100+
let regressionCtx = RegressionContext mlcontext
101+
let metrics = regressionCtx.Evaluate(predictions, "Label", "Score")
102+
let algorithmName = "SdcaRegressionTrainer"
103+
printfn "*************************************************"
104+
printfn "* Metrics for %s" algorithmName
105+
printfn "*------------------------------------------------"
106+
printfn "* R2 Score: %.2f" metrics.RSquared
107+
printfn "* RMS loss: %.2f" metrics.Rms
108+
printfn "* Absolute loss: %.2f" metrics.L1
109+
printfn "* Squared loss: %.2f" metrics.L2
110+
printfn "*************************************************"
95111
```
96112

97113
>*To learn more on how to understand the metrics, check out the Machine Learning glossary from the [ML.NET Guide](https://docs.microsoft.com/en-us/dotnet/machine-learning/) or use any available materials on data science and machine learning*.
@@ -104,20 +120,31 @@ If you are not satisfied with the quality of the model, there are a variety of w
104120
After the model is trained, we can use the `Predict()` API to predict the fare amount for a specified trip.
105121

106122
```fsharp
107-
let prediction = model.Predict(TestTaxiTrips.Trip1)
108-
Console.WriteLine(sprintf "Predicted fare: {prediction.FareAmount:0.####}, actual fare: 29.5")
123+
//Prediction test
124+
// Create prediction engine and make prediction.
125+
let engine = model.MakePredictionFunction<TaxiTrip, TaxiTripFarePrediction> mlcontext
126+
127+
//Sample:
128+
//vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount
129+
//VTS,1,1,1140,3.75,CRD,15.5
130+
let taxiTripSample = {
131+
VendorId = "VTS"
132+
RateCode = "1"
133+
PassengerCount = 1.0f
134+
TripTime = 1140.0f
135+
TripDistance = 3.75f
136+
PaymentType = "CRD"
137+
FareAmount = 0.0f // To predict. Actual/Observed = 15.5
138+
}
139+
140+
let prediction = engine.Predict taxiTripSample
141+
printfn "**********************************************************************"
142+
printfn "Predicted fare: %.4f, actual fare: 15.5" prediction.FareAmount
143+
printfn "**********************************************************************"
109144
```
110-
Where `TestTaxiTrips.Trip1` stores the information about the trip we'd like to get the prediction for.
111145

112-
```fsharp
113-
module TestTaxiTrips =
114-
let Trip1 =
115-
TaxiTrip(
116-
VendorId = "VTS",
117-
RateCode = "1",
118-
PassengerCount = 1.0,
119-
TripDistance = 10.33,
120-
PaymentType = "CSH",
121-
FareAmount = 0.0 // predict it. actual = 29.5
122-
)
123-
```
146+
147+
Finally, you can plot a chart showing how the tested predictions are distributed and how the regression is performing, using the `plotRegressionChart` function, as in the following screenshot:
148+
149+
150+
![Regression plot-chart](images/Sample-Regression-Chart.png)

samples/fsharp/getting-started/Regression_TaxiFarePrediction/Regression_TaxiFarePrediction.fsproj

+5
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,16 @@
1010
<Folder Include="datasets\" />
1111
<None Include="..\..\..\..\datasets\taxi-fare-test.csv" Link="datasets\taxi-fare-test.csv" />
1212
<None Include="..\..\..\..\datasets\taxi-fare-train.csv" Link="datasets\taxi-fare-train.csv" />
13+
<None Include="README.md" />
1314
</ItemGroup>
1415

1516
<ItemGroup>
1617
<PackageReference Include="Microsoft.ML" Version="$(MicrosoftMLVersion)" />
1718
<PackageReference Include="PLplot" Version="5.13.7" />
1819
</ItemGroup>
1920

21+
<ItemGroup>
22+
<ProjectReference Include="..\Microsoft.ML.Core.FSharp\Microsoft.ML.Core.FSharp.fsproj" />
23+
</ItemGroup>
24+
2025
</Project>

0 commit comments

Comments
 (0)