1
1
import AdmZip from 'adm-zip' ;
2
2
import { deserialize } from 'bson' ;
3
- import { ObjectId } from 'mongodb' ;
4
- import { insert } from '../connector' ;
3
+ import isEqual from 'fast-deep-equal' ;
4
+ import { AnyBulkWriteOperation , Document , FindCursor , ObjectId } from 'mongodb' ;
5
+ import { bulkWrite , db , insert } from '../connector' ;
6
+
7
/** Entry of a page's `static_assets` list. */
interface StaticAsset {
  /** Key identifying the asset. */
  key: string;
  /** Checksum recorded for the asset. */
  checksum: string;
}
11
+
12
/**
 * AST of a page. The shape is intentionally left open: any string property
 * with any value is accepted.
 */
interface PageAst {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  [property: string]: any;
}
15
+
16
/** Shape of a document stored in the updated-pages collection. */
export interface UpdatedPage {
  /** Identifier of the page; also used as the filter key for upserts. */
  page_id: string;
  /** Source filename of the page. */
  filename: string;
  /** AST of the page as of its last update. */
  ast: PageAst;
  /** Static assets attached to the page. */
  static_assets: Array<StaticAsset>;

  /** Set once, when the document is first inserted. */
  created_at: Date;
  /** Refreshed whenever the document is updated or marked as deleted. */
  updated_at: Date;
  /** `true` once the page is no longer part of the latest build. */
  deleted: boolean;
}
5
26
6
27
/** Collection that page documents are inserted into together with their build id. */
const COLLECTION_NAME = 'documents';

/** Collection holding the per-`page_id` copies that are upserted when a page's AST changes. */
const UPDATED_AST_COLL_NAME = 'updated_documents';
7
29
8
30
// Service responsible for memoization of page-level documents.
9
31
// Any extraneous logic performed on page-level documents as part of upload should be added here.
@@ -15,12 +37,175 @@ const pagesFromZip = (zip: AdmZip) => {
15
37
. map ( ( entry ) => deserialize ( entry . getData ( ) ) ) ;
16
38
} ;
17
39
18
- export const insertPages = async ( buildId : ObjectId , zip : AdmZip ) => {
40
/**
 * Finds the non-deleted page documents for a given Snooty project name + branch
 * combination. If this is the first build for the Snooty project name + branch,
 * no documents will be found.
 *
 * @param pageIdPrefix - Includes the Snooty project name, user (docsworker-xlarge), and branch
 * @param collection - The collection to perform the find query on
 * @returns A cursor over the matching documents, projected down to `page_id` and `ast`
 * @throws Rethrows, after logging, any error raised while creating the cursor
 */
const findPrevPageDocs = async (pageIdPrefix: string, collection: string) => {
  const dbSession = await db();

  // The prefix is data, not a pattern: escape regex metacharacters so that e.g.
  // the "." in a branch named "v1.0" only matches a literal dot.
  const escapedPrefix = pageIdPrefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Require the prefix to end on a path-segment boundary. Without it, the prefix
  // ".../master" would also match pages of ".../master-v2", and those pages would
  // later be treated as belonging to (and possibly deleted from) this branch.
  const boundary = pageIdPrefix.endsWith('/') ? '' : '(?:/|$)';

  const findQuery = {
    page_id: { $regex: new RegExp(`^${escapedPrefix}${boundary}`) },
    deleted: false,
  };
  const projection = {
    _id: 0,
    page_id: 1,
    ast: 1,
  };

  try {
    // NOTE(review): this only builds the cursor; errors raised while the caller
    // iterates it will presumably surface there rather than in this catch.
    return dbSession.collection<UpdatedPage>(collection).find(findQuery).project(projection);
  } catch (error) {
    console.error(
      `Error trying to find previous page documents using prefix ${pageIdPrefix} in ${collection}: ${error}`
    );
    throw error;
  }
};
70
+
71
/**
 * Drains a cursor of page documents into two lookup structures.
 *
 * @param docsCursor - Cursor yielding documents that carry `page_id` and `ast`
 * @returns `mapping`, which maps each page id to its AST, and `pageIds`, the set
 *   of every page id seen (used to track pages missing from the current build)
 */
const createPageAstMapping = async (docsCursor: FindCursor) => {
  const astByPageId: Record<string, object> = {};
  const knownPageIds = new Set<string>();

  for await (const { page_id: pageId, ast } of docsCursor) {
    astByPageId[pageId] = ast;
    knownPageIds.add(pageId);
  }

  return { mapping: astByPageId, pageIds: knownPageIds };
};
82
+
83
/**
 * Computes the bulk-write operations that bring the stored page documents in
 * line with the pages of the current build. All work happens in the
 * constructor; the result is read back through `getOperations()`.
 */
class UpdatedPagesManager {
  currentPages: Document[];
  operations: AnyBulkWriteOperation[];
  prevPageDocsMapping: Record<string, object>;
  prevPageIds: Set<string>;

  /**
   * @param prevPageDocsMapping - Page id to AST mapping of the previously stored pages
   * @param prevPagesIds - Ids of the previously stored pages; entries are removed
   *   from this set as the matching pages are seen in the current build
   * @param pages - Pages of the current build
   */
  constructor(prevPageDocsMapping: Record<string, object>, prevPagesIds: Set<string>, pages: Document[]) {
    this.prevPageDocsMapping = prevPageDocsMapping;
    this.prevPageIds = prevPagesIds;
    this.currentPages = pages;
    this.operations = [];

    // A single timestamp is shared by every operation produced for this build.
    const now = new Date();
    this.checkForPageDiffs(now);
    this.markUnseenPagesAsDeleted(now);
  }

  /**
   * Queues an upsert for every current page whose AST differs from the stored
   * one, and removes each visited page id from `prevPageIds` so that it counts
   * as "seen".
   *
   * @param updateTime - the time to set updates to
   */
  checkForPageDiffs(updateTime: Date) {
    for (const page of this.currentPages) {
      const { filename, page_id: pageId } = page;

      // Only `.txt` files are pages; everything else (e.g. rst includes) is skipped.
      if (!filename.endsWith('.txt')) {
        continue;
      }

      this.prevPageIds.delete(pageId);

      // A page without a stored AST compares against `undefined`, so new pages
      // are always treated as changed.
      const unchanged = isEqual(page.ast, this.prevPageDocsMapping[pageId]);
      if (unchanged) {
        continue;
      }

      this.operations.push(this.buildUpsert(page, updateTime));
    }
  }

  /**
   * Queues a "deleted" update for every previously stored page that was not
   * seen in the current build.
   *
   * @param updateTime - the time to set updates to
   */
  markUnseenPagesAsDeleted(updateTime: Date) {
    for (const unseenPageId of this.prevPageIds) {
      this.operations.push(this.buildDeletion(unseenPageId, updateTime));
    }
  }

  /** Returns the operations accumulated so far. */
  getOperations() {
    return this.operations;
  }

  /** Builds the upsert that stores the current state of `page`. */
  private buildUpsert(page: Document, updateTime: Date) {
    return {
      updateOne: {
        filter: { page_id: page.page_id },
        update: {
          $set: {
            page_id: page.page_id,
            filename: page.filename,
            ast: page.ast,
            static_assets: page.static_assets,
            updated_at: updateTime,
            deleted: false,
          },
          // Only written when the upsert creates the document.
          $setOnInsert: {
            created_at: updateTime,
          },
        },
        upsert: true,
      },
    };
  }

  /** Builds the update that flags the page with the given id as deleted. */
  private buildDeletion(pageId: string, updateTime: Date) {
    return {
      updateOne: {
        filter: { page_id: pageId },
        update: {
          $set: {
            deleted: true,
            updated_at: updateTime,
          },
        },
      },
    };
  }
}
172
+
173
/**
 * Upserts pages into a separate collection, keyed by `page_id`. Only documents
 * that belong to the same Snooty project name + branch as the given pages are
 * considered, and only changed pages result in writes.
 *
 * @param pages - Pages of the current build; nothing happens when empty
 * @param collection - Name of the collection to read from and write to
 */
const updatePages = async (pages: Document[], collection: string) => {
  if (!pages.length) {
    return;
  }

  // The first three "/"-separated segments of a page id are expected to carry
  // the project name + branch after parse; the first page is taken as
  // representative of the whole build.
  const idSegments = pages[0].page_id.split('/');
  const pageIdPrefix = idSegments.slice(0, 3).join('/');

  const prevDocsCursor = await findPrevPageDocs(pageIdPrefix, collection);
  const previous = await createPageAstMapping(prevDocsCursor);

  const operations = new UpdatedPagesManager(previous.mapping, previous.pageIds, pages).getOperations();
  if (operations.length === 0) {
    return;
  }

  await bulkWrite(operations, collection);
};
200
+
201
/**
 * Reads the page documents from the uploaded zip, inserts them for the given
 * build, and concurrently updates the per-page copies in the updated-pages
 * collection.
 *
 * @param buildId - Id of the build the pages belong to
 * @param zip - Archive containing the page documents
 * @returns The results of the insert and the update, in that order
 * @throws Rethrows, after logging, any error raised while reading or writing the pages
 */
export const insertAndUpdatePages = async (buildId: ObjectId, zip: AdmZip) => {
  try {
    const pages = pagesFromZip(zip);
    // `await` is required here: returning the bare promise would let rejections
    // skip the catch block below, so failures would never be logged.
    return await Promise.all([insert(pages, COLLECTION_NAME, buildId), updatePages(pages, UPDATED_AST_COLL_NAME)]);
  } catch (error) {
    console.error(`Error at insertion time for ${COLLECTION_NAME}: ${error}`);
    throw error;
  }
};
210
+
211
// Exposes the module-private `updatePages` under an underscore-prefixed alias
// (presumably so tests can call it directly; confirm against its importers).
export { updatePages as _updatePages };
0 commit comments