-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathevaluation.py
More file actions
97 lines (84 loc) · 3.51 KB
/
evaluation.py
File metadata and controls
97 lines (84 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
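'''
Effectiveness evaluation covers:
1. How many action_history entries the agent itself received
2. The agent's own action_space
3. How many beliefs were received
4. How many relationships were received
5. Whether the update scores fall within the specified range
Reasonability evaluation
1. Generate thought, goal and action, then let GPT judge whether the behavior is consistent
2. Everything said by a given character
3. All characters
4. Design specific action histories to judge
- the updated relationship values
- the updated belief values
- the reasonableness of guess
- the reasonableness of vote
- the reasonableness of vote except self
5. Across all endings, the differences among guess, vote and vote except self
6.
'''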
import collections
import json
import os.path
import numpy
from config import TEST_FOLDER
# --- Script configuration -------------------------------------------------
# Earlier single-run setups pointed `evaluate_saving_dir` directly at
# './storage/test_version' or './storage/succession/saving/gpt35_7' and
# derived `action_history_dir` from it; the script now walks every run
# found under `suc_dir` instead.

# Root folder; each sub-directory (except 'initial_version') is one saved run.
suc_dir = './storage/succession/saving'

# True: n-grams are taken over the characters of the action text.
# False: the action text is first split on single spaces into word tokens.
chn = True

# True enables the "Quantity Evaluation" section; False skips it.
qe = False

# Window size used when counting n-grams for the entropy evaluation.
n_gram = 2
# Quantity evaluation: for every saved run under `suc_dir`, read the JSON-lines
# files in its 'action_history' folder, collect the records whose action_type
# starts with '### EVALUATION', and print, per action_type, the fraction of
# records whose agent response equals the ground truth exactly.
if qe:
    print('='*50)
    print('Quantity Evaluation')
    print('='*50)
    print()
    for evaluate_saving in os.listdir(suc_dir):
        if evaluate_saving == 'initial_version':
            continue
        evaluate_saving_dir = os.path.join(suc_dir, evaluate_saving)
        if not os.path.isdir(evaluate_saving_dir):
            continue
        action_history_dir = os.path.join(evaluate_saving_dir, 'action_history')
        # A run without an action history has nothing to score; skip it
        # instead of letting os.listdir abort the whole evaluation.
        if not os.path.isdir(action_history_dir):
            continue

        # action_type -> list of [agent_response, ground_truth] pairs
        line_to_be_evaluated = collections.defaultdict(list)
        for json_file in os.listdir(action_history_dir):
            json_path = os.path.join(action_history_dir, json_file)
            # `with` guarantees the handle is closed once the file is consumed.
            with open(json_path, encoding='utf-8') as json_data:
                for json_line in json_data:
                    json_line = json.loads(json_line)
                    action_type = json_line['action_type']
                    action = json_line['action']
                    # The action text embeds
                    # "agent response: <answer>[SEP]... ground truth: <answer>".
                    agent_response = action.split('agent response: ')[-1].split('[SEP]')[0].strip()
                    ground_truth = action.split('ground truth: ')[-1].strip()
                    # '[SKIP]' responses are excluded from the accuracy.
                    if action_type.startswith('### EVALUATION') and agent_response != '[SKIP]':
                        line_to_be_evaluated[action_type].append([agent_response, ground_truth])

        if line_to_be_evaluated:
            print(evaluate_saving)
            for eval_type, pairs in line_to_be_evaluated.items():
                # Exact-match accuracy: 1 for a correct response, 0 otherwise.
                accuracy = numpy.mean([1 if response == truth else 0 for response, truth in pairs])
                print(eval_type, 'mean=', accuracy)
# Entropy evaluation: for every saved run under `suc_dir`, count the n-grams
# (window size `n_gram`) of all '### SAY' / '### SPEECH...' actions found in
# the run's 'action_history' JSON-lines files, then print the number of
# distinct n-grams, the Shannon entropy (natural log) of the n-gram
# distribution, and the total number of n-grams counted.
print('='*50)
print('Entropy Evaluation')
print('='*50)
print()
for evaluate_saving in os.listdir(suc_dir):
    if evaluate_saving == 'initial_version':
        continue
    evaluate_saving_dir = os.path.join(suc_dir, evaluate_saving)
    if not os.path.isdir(evaluate_saving_dir):
        continue
    action_history_dir = os.path.join(evaluate_saving_dir, 'action_history')
    # A run without an action history has nothing to measure; skip it
    # instead of letting os.listdir abort the whole evaluation.
    if not os.path.isdir(action_history_dir):
        continue

    n_gram_dict = collections.defaultdict(int)  # n-gram -> occurrences
    count = 0  # total number of n-grams seen in this run
    for json_file in os.listdir(action_history_dir):
        json_path = os.path.join(action_history_dir, json_file)
        # `with` guarantees the handle is closed once the file is consumed.
        with open(json_path, encoding='utf-8') as json_data:
            for json_line in json_data:
                json_line = json.loads(json_line)
                action_type = json_line['action_type']
                action = json_line['action']
                if not chn:
                    # Word-level mode: keep the tokens in a tuple so that each
                    # slice below is hashable (a list slice cannot be a dict
                    # key and would raise TypeError).
                    action = tuple(action.split(' '))
                if action_type == '### SAY' or action_type.startswith('### SPEECH'):
                    for i in range(len(action) - n_gram + 1):
                        n_gram_dict[action[i:i + n_gram]] += 1
                        count += 1

    # H = -sum(p * ln p) over the observed n-gram distribution; stays 0 when
    # no n-gram was counted (the loop body never runs, so no division by 0).
    entropy = 0
    for occurrences in n_gram_dict.values():
        p = occurrences / count
        entropy -= p * numpy.log(p)
    print('%30s, %d-gram count: %7d, Entropy: %7.5f, All Token: %10d'%(evaluate_saving,n_gram, len(n_gram_dict),entropy, count))