-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_data.py
37 lines (31 loc) · 1.18 KB
/
extract_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# importing necessary libraries
import pandas as pd
import json
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
# Prefer the CUDA GPU when PyTorch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
def _load_json_lines(path):
    """Parse a JSON Lines file into a list with one decoded object per line.

    Args:
        path: Location of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list of the decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf8") as handle:
        # Blank lines (e.g. a stray empty line at the end of the dump) carry
        # no record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in handle if line.strip()]


# Reading the first file (data for Dec 2019)
json_data1 = _load_json_lines(
    "16119_webhose_2019_12_db21c91a1ab47385bb13773ed8238c31_0000001.json"
)

# Reading the second file (data for Jan 2020)
json_data2 = _load_json_lines(
    "16119_webhose_2020_01_db21c91a1ab47385bb13773ed8238c31_0000001.json"
)
# Extract the "text" value (model input) and the "title" value (label) of
# every record from both JSON dumps. dataset[i] and target[i] always come
# from the same record, Dec 2019 records first, then Jan 2020.
dataset = []
target = []
for records in (json_data1, json_data2):
    for record in records:
        # Assumes each record is a dict (one decoded JSON object per line).
        # .get() returns None for a missing key, so the two lists stay the
        # same length even when a record lacks "text" or "title".
        dataset.append(record.get("text"))
        target.append(record.get("title"))

print("The length of dataset is:", len(dataset))
print("The length of target is:", len(target))