#!/usr/bin/env bash
#
# marcextractbyid -- extract MARC records whose identifier matches a single
# value or a list of values.
#
# Usage:
#   marcextractbyid [filename] [marcid_field]             (IDs read from a file named 'ids')
#   marcextractbyid [filename] [id_value] [marcid_field]  (single ID)
#
# marcid_field is optional. If not specified, it will be 999i if a UUID is
# detected, 907a if a Sierra .b number is detected, 001 otherwise.
# No first argument (or an empty one): print the help text and stop.
# The exit status is whatever the last command returned, i.e. success.
[[ -n "${1}" ]] || {
  cat <<'USAGE'

Usage: marcextractbyid [filename] [marcid_field]
This program expects a single identifier or input file of identifiers named 'ids'
marcid_field is optional. If not specified, it will be 999i if UUID is detected, 907a if sierra .b number is detected, 001 otherwise

Usage: marcextractbyid [filename] [marcid_field]
Usage: marcextractbyid [filename]

Usage: marcextractbyid [filename] [id_value] [marcid_field]
Usage: marcextractbyid [filename] [id_value]

USAGE
  exit
}
#######################################
# Guess which MARC field/subfield holds the identifier.
# Inputs:  sample identifier(s) on stdin, one per line
# Outputs: "999i" if any line contains a UUID, "907a" if any line starts
#          with an optional dot followed by "b" and six digits (the .b test
#          wins when both match), otherwise nothing
#######################################
detect_idtag() {
  local sample tag=""
  sample=$(cat)
  if grep -qE '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' <<<"${sample}"; then
    tag="999i"
  fi
  if grep -qE '^\.?b[0-9]{6}' <<<"${sample}"; then
    tag="907a"
  fi
  printf '%s' "${tag}"
}

infile="${1}"

if [[ -f ids ]]; then
  # List mode: identifiers come from the file 'ids'; $2 optionally names the
  # field. Only the first lines (head) are sampled for auto-detection.
  idtag="${2}"
  if [[ -z "${idtag}" ]]; then
    idtag=$(head ids | detect_idtag)
  fi
  echo "Finding records matching values in 'ids' file"
  # One ID per 0x1D-terminated record, with CRs and double quotes removed.
  tr "\n" $'\x1d' < ids | tr -d "\r" | tr -d '"' > tmp_ids
  singleid=0
else
  # Single-ID mode: $2 is the identifier, $3 optionally names the field.
  idtag="${3}"
  if [[ -z "${idtag}" ]]; then
    # Detect from the identifier itself (the tag is empty at this point,
    # so inspecting it could never detect anything).
    idtag=$(printf '%s\n' "${2}" | detect_idtag)
  fi
  printf '%s\n' "${2}" | tr "\n" $'\x1d' | tr -d '"' > tmp_ids
  echo "Looking for first record matching ${2} in ${infile}"
  singleid=1
fi

# Fall back to the control number field when nothing was given or detected.
if [[ -z "${idtag}" ]]; then idtag="001"; fi

# Strip a trailing 3- or 4-character extension to derive the output names.
fileroot=$(printf '%s\n' "${infile}" | sed 's/\.....\?$//')
matched="${fileroot}_matched.mrc"
notmatched="${fileroot}_not-matched.mrc"
# Remove results of an earlier run so stale output is never mistaken for new.
rm -f -- "${matched}" "${notmatched}"
# The awk program is kept in a shell variable and written to a temp file
# later. It is stored with doubled backslashes where awk must see a single
# one, because the text is passed through an escape-expanding echo/printf
# before awk reads it. Keep backslashes out of the awk comments for the
# same reason.
read -r -d '' awkscript << "ENDOFAWK"
#!/usr/bin/awk -f
#
# Inputs (set with -v by the caller):
#   IDTAG       3-char tag, or tag plus one subfield code (4 chars)
#   SFS         subfield delimiter
#   MATCHED     output file for matching records
#   NOTMATCHED  output file for non-matching records (list mode only)
#   SINGLEID    0 = list mode, anything else = stop at the first match
# First file operand: wanted IDs, one per record. Second: the MARC file.

# True when candidate, minus trailing spaces, is a non-empty wanted ID.
# Uses "in" so that lookups never add entries to the seen array.
function id_wanted(candidate) {
    sub(/ +$/, "", candidate)
    return (candidate != "" && (candidate in seen))
}

BEGIN {
    foundrecs = 0
    notfoundrecs = 0
    recordcounter = 0
    badawk = 0
    # The literal below is a two-byte UTF-8 character; its length is 2 only
    # when awk counts bytes rather than characters.
    if (length("а") != 2) {
        badawk = 1
        printf("Your version of awk does not support marcextractbyid -- you need a version that supports the -b switch\\n")
        exit 1
    }
    IDTAG3 = substr(IDTAG, 1, 3)
    SUBCODE = substr(IDTAG, 4, 1)
}

# ID list. Compared by file name rather than NR == FNR, which would also be
# true for the MARC file whenever the ID list is empty.
FILENAME == ARGV[1] {
    id = $1
    gsub("\\n", "", id)
    sub(/ +$/, "", id)
    # Blank entries are skipped so records lacking the ID field never match.
    if (id != "") seen[id] = 1
    next
}

# MARC records.
{
    leader = substr($0, 1, 24)
    baseaddress = substr(leader, 13, 5) + 0
    directory = substr($0, 25, baseaddress - 25)
    directory_length = length(directory)
    record_content = substr($0, baseaddress + 1)
    found = 0

    # Only trust a directory made of whole 12-character entries.
    if (directory_length % 12 == 0) {
        for (i = 1; i <= directory_length && !found; i += 12) {
            if (substr(directory, i, 3) != IDTAG3) continue
            field_length = substr(directory, i + 3, 4) + 0
            starting_pos = substr(directory, i + 7, 5) + 0
            # Directory offsets are 0-based, substr is 1-based; the stored
            # length includes the field terminator, hence the minus one.
            field_content = substr(record_content, starting_pos + 1, field_length - 1)
            if (length(IDTAG) == 3) {
                found = id_wanted(field_content)
            } else if (length(IDTAG) == 4) {
                nsub = split(field_content, subfields, SFS)
                # Element 1 is the text before the first delimiter (the
                # indicators), so the scan starts at 2 and runs in order.
                for (k = 2; k <= nsub && !found; k++) {
                    if (substr(subfields[k], 1, 1) == SUBCODE) {
                        found = id_wanted(substr(subfields[k], 2))
                    }
                }
            }
        }
    }

    recordcounter++
    if (found) {
        foundrecs++
        print $0 > MATCHED
        # Single-ID mode wants only the first matching record.
        if (SINGLEID != 0) exit
    } else if (SINGLEID == 0) {
        notfoundrecs++
        print $0 > NOTMATCHED
    }
    if (recordcounter % 10000 == 0) { printf "Records searched: %d Matching: %d Not matching: %d \r", recordcounter, foundrecs, notfoundrecs }
}

END {
    # No statistics after the unsupported-awk abort in BEGIN.
    if (badawk) exit 1
    printf "Records searched: %d Matching: %d Not matching: %d \r", recordcounter, foundrecs, notfoundrecs
}
ENDOFAWK
# Write the awk program to a temp file. %b expands the backslash escapes
# stored in ${awkscript} the same way `echo -e` does, without echo's option
# parsing.
printf '%b\n' "${awkscript}" > tmp_checkmarc
chmod 700 tmp_checkmarc

# Records end with 0x1D, fields with 0x1E and subfields start with 0x1F.
# The ID list (tmp_ids) is read first, then the MARC file. All expansions
# are quoted so unusual tags or file names cannot be word-split or globbed.
if ! awk -v ORS=$'\x1d' -v FS=$'\x1e' -v RS=$'\x1d' -v SFS=$'\x1f' \
  -v IDTAG="${idtag}" \
  -v MATCHED="${matched}" -v NOTMATCHED="${notmatched}" \
  -v SINGLEID="${singleid}" \
  -b -f tmp_checkmarc "tmp_ids" "${infile}"; then
  # Warn only; the summary below and the cleanup still run as before.
  echo "marcextractbyid: awk reported an error; results may be incomplete" >&2
fi

# The awk progress line ends with a carriage return, so move past it.
echo
echo

if [[ "${singleid}" == 0 ]]; then
  echo "Records matching ID file were output to ${matched} "
  echo "Records not matching ID file were output to ${notmatched} "
  echo
fi

if [[ "${singleid}" == 1 && -f "${matched}" ]]; then
  # NOTE(review): `text` is an external command not defined in this file;
  # presumably it renders the MARC record as readable text -- confirm.
  text < "${matched}"
  echo
  echo "Raw MARC record is in ${matched}"
  echo
fi

rm -f -- tmp_checkmarc tmp_ids