Skip to content

Commit 2e987e7

Browse files
committed
Added new data generation/split scripts
1 parent 68bab73 commit 2e987e7

File tree

4 files changed

+107
-87
lines changed

4 files changed

+107
-87
lines changed

README.md

+24-3
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ source activate gcn-tsp-env
5555

5656
# Install all dependencies and Jupyter Lab (for using notebooks).
5757
conda install pytorch=0.4.1 cuda90 -c pytorch
58-
conda install numpy==1.15.4 scipy==1.1.0 matplotlib==3.0.2 seaborn==0.9.0 pandas==0.24.2 networkx==2.2 scikit-learn==0.20.2 tensorflow-gpu==1.12.0 tensorboard==1.12.0
58+
conda install numpy==1.15.4 scipy==1.1.0 matplotlib==3.0.2 seaborn==0.9.0 pandas==0.24.2 networkx==2.2 scikit-learn==0.20.2 tensorflow-gpu==1.12.0 tensorboard==1.12.0 Cython
5959
pip3 install tensorboardx==1.5 fastprogress==0.1.18
6060
conda install -c conda-forge jupyterlab
6161
```
@@ -77,6 +77,27 @@ python main.py --config <path-to-config.json>
7777
```
7878

7979
#### Splitting datasets into Training and Validation sets
80-
For TSP10, TSP20 and TSP30 datasets, everything is good to go.
80+
For TSP10, TSP20 and TSP30 datasets, everything is good to go once you download and extract the files.
8181
For TSP50 and TSP100, the 1M training set needs to be split into 10K validation samples and 999K training samples.
82-
Use the `split_train_val.ipynb` notebook to do this through Jupyter Lab.
82+
Use the `split_train_val.py` script to do so.
83+
For consistency, the script uses the first 10K samples in the 1M file as the validation set and the remaining 999K as the training set.
84+
85+
```sh
86+
cd data
87+
python split_train_val.py --num_nodes <num-nodes>
88+
```
89+
90+
### Generating new data
91+
New TSP data can be generated using the [Concorde solver](https://github.com/jvkersch/pyconcorde).
92+
93+
```sh
94+
# Install the pyConcorde library in the /data directory
95+
cd data
96+
git clone https://github.com/jvkersch/pyconcorde
97+
cd pyconcorde
98+
pip install -e .
99+
cd ..
100+
101+
# Run the data generation script
102+
python generate_tsp_concorde.py --num_samples <num-samples> --num_nodes <num-nodes>
103+
```

data/generate_tsp_concorde.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import time
2+
import argparse
3+
import pprint as pp
4+
import os
5+
6+
import pandas as pd
7+
import numpy as np
8+
from concorde.tsp import TSPSolver
9+
10+
11+
if __name__ == "__main__":
12+
parser = argparse.ArgumentParser()
13+
parser.add_argument("--num_samples", type=int, default=10000)
14+
parser.add_argument("--num_nodes", type=int, default=20)
15+
parser.add_argument("--node_dim", type=int, default=2)
16+
parser.add_argument("--filename", type=str, default=None)
17+
opts = parser.parse_args()
18+
19+
if opts.filename is None:
20+
opts.filename = f"tsp{opts.num_nodes}_concorde.txt"
21+
22+
# Pretty print the run args
23+
pp.pprint(vars(opts))
24+
25+
set_nodes_coord = np.random.random([opts.num_samples, opts.num_nodes, opts.node_dim])
26+
with open(opts.filename, "w") as f:
27+
start_time = time.time()
28+
for nodes_coord in set_nodes_coord:
29+
solver = TSPSolver.from_data(nodes_coord[:,0], nodes_coord[:,1], norm="GEO")
30+
solution = solver.solve()
31+
f.write( " ".join( str(x)+str(" ")+str(y) for x,y in nodes_coord) )
32+
f.write( str(" ") + str('output') + str(" ") )
33+
f.write( str(" ").join( str(node_idx+1) for node_idx in solution.tour) )
34+
f.write( str(" ") + str(solution.tour[0]+1) + str(" ") )
35+
f.write( "\n" )
36+
end_time = time.time() - start_time
37+
38+
print(f"Completed generation of {opts.num_samples} samples of TSP{opts.num_nodes}.")
39+
print(f"Total time: {end_time/3600:.1f}h")
40+
print(f"Average time: {(end_time/3600)/opts.num_samples:.1f}h")

data/split_train_val.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import time
import argparse
import pprint as pp
import os


if __name__ == "__main__":
    # Split a Concorde-generated TSP dataset file into validation (first
    # `val_size` lines) and training (remainder) files, preserving line order.
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_nodes", type=int, default=20)
    parser.add_argument("--val_size", type=int, default=10000)
    # node_dim is unused here; kept so the CLI stays compatible with the
    # generation script's flags.
    parser.add_argument("--node_dim", type=int, default=2)
    parser.add_argument("--filename", type=str, default=None)
    opts = parser.parse_args()

    if opts.filename is None:
        opts.filename = f"tsp{opts.num_nodes}_concorde.txt"

    # Pretty print the run args
    pp.pprint(vars(opts))

    start_time = time.time()

    # Read with an explicit encoding (matching the utf-8 writes below) and
    # close the handle promptly — the original leaked an open file object.
    with open(opts.filename, "r", encoding="utf-8") as f:
        filedata = f.readlines()
    print("Total samples: ", len(filedata))
    val_data = filedata[:opts.val_size]
    print("Validation samples: ", len(val_data))
    train_data = filedata[opts.val_size:]
    print("Training samples: ", len(train_data))

    # Create separate validation data file
    with open(f"tsp{opts.num_nodes}_val_concorde.txt", "w", encoding="utf-8") as f:
        f.writelines(val_data)

    # Create separate train data file
    with open(f"tsp{opts.num_nodes}_train_concorde.txt", "w", encoding="utf-8") as f:
        f.writelines(train_data)

    end_time = time.time() - start_time

    # Report elapsed seconds with a unit: the original printed hours with .1f
    # and no unit, which shows 0.0 for any realistic runtime.
    print(f"Total time: {end_time:.1f}s")
43+

split_train_val.ipynb

-84
This file was deleted.

0 commit comments

Comments
 (0)