66 "hash/fnv"
77 "html/template"
88 "sort"
9- "strconv"
10- "strings"
119 tmpl "text/template"
1210 "unicode/utf8"
1311
@@ -18,16 +16,16 @@ import (
1816 "github.com/rs/zerolog/log"
1917)
2018
21- type CSVKey struct {
22- filename string
23- lineKey string
24- }
25-
2619type JaccardCSV struct {
2720 csvLine model.Entry
2821 lineKey string
2922}
3023
// fileuri is the rendered URI of a CSV file. It is the outer key of the
// per-file record index kept in MaskEngine.csvEntryByKey.
type fileuri string

// linekey is the exact-match key computed for a CSV line or for the entry
// being masked. It is the inner key of the per-file record index.
type linekey string
28+
3129type MaskEngine struct {
3230 seeder model.Seeder
3331 templateURI * template.Template
@@ -36,8 +34,7 @@ type MaskEngine struct {
3634 temJaccardCSV * tmlmask.Engine // template to compute key for a csv entry
3735 temJaccardEntry * tmlmask.Engine // template to compute key for json entry
3836 expected string
39- csvAllreadyRead map [string ][]model.Dictionary
40- csvEntryByKey map [CSVKey ][]model.Entry
37+ csvEntryByKey map [fileuri ]map [linekey ][]model.Entry
4138 header bool
4239 sep rune
4340 comment rune
@@ -115,8 +112,7 @@ func NewMask(conf model.FindInCSVType, seed int64, seeder model.Seeder) (MaskEng
115112 temJaccardCSV ,
116113 temJaccardEntry ,
117114 expected ,
118- map [string ][]model.Dictionary {},
119- map [CSVKey ][]model.Entry {},
115+ map [fileuri ]map [linekey ][]model.Entry {},
120116 conf .Header ,
121117 sep , comment , conf .FieldsPerRecord , conf .TrimSpace ,
122118 }, err
@@ -133,10 +129,10 @@ func (me *MaskEngine) Mask(e model.Entry, context ...model.Dictionary) (model.En
133129 if err := me .templateURI .Execute (& filenameBuffer , context [0 ].UnpackUnordered ()); err != nil {
134130 return nil , err
135131 }
136- filename := filenameBuffer .String ()
132+ filename := fileuri ( filenameBuffer .String () )
137133
138134 // Get ExactMatch results
139- exactMatchFinded , exactMatchResult , err := me .ExactMatch (filename , context )
135+ exactMatchFinded , exactMatchResult , err := me .exactMatch (filename , context )
140136 if err != nil {
141137 return nil , err
142138 }
@@ -154,7 +150,7 @@ func (me *MaskEngine) Mask(e model.Entry, context ...model.Dictionary) (model.En
154150}
155151
156152// getJaccardMatchResults calculates Jaccard similarity for the given CSV filename and exact match results.
157- func (me * MaskEngine ) getJaccardMatchResults (filename string , exactMatchResults []model.Entry , context []model.Dictionary ) ([]model.Entry , error ) {
153+ func (me * MaskEngine ) getJaccardMatchResults (filename fileuri , exactMatchResults []model.Entry , context []model.Dictionary ) ([]model.Entry , error ) {
158154 var jaccardEntryBuffer bytes.Buffer
159155 if err := me .temJaccardEntry .Execute (& jaccardEntryBuffer , context [0 ].UnpackUnordered ()); err != nil {
160156 return nil , err
@@ -163,19 +159,14 @@ func (me *MaskEngine) getJaccardMatchResults(filename string, exactMatchResults
163159
164160 // If no exactMatch config
165161 if len (exactMatchResults ) < 1 {
166- var csvList []model.Dictionary
167- if _ , ok := me .csvAllreadyRead [filename ]; ! ok {
168- var err error
169- csvList , err = me .readCSV (filename )
170- if err != nil {
171- return nil , err
172- }
173- } else {
174- csvList = me .csvAllreadyRead [filename ]
162+ csvList , err := me .readCSV (filename )
163+ if err != nil {
164+ return nil , err
175165 }
176166
177167 var records []JaccardCSV
178- for _ , record := range csvList {
168+ for i := 0 ; i < csvList .Len (); i ++ {
169+ record := csvList .Get (i )
179170 lineKey , err := me .computeCSVLineKey (record , false )
180171 if err != nil {
181172 return nil , err
@@ -198,46 +189,45 @@ func (me *MaskEngine) getJaccardMatchResults(filename string, exactMatchResults
198189 return sortBySimilarity (jaccardEntryString , records ), nil
199190}
200191
201- func (me * MaskEngine ) ExactMatch (filename string , context []model.Dictionary ) (bool , []model.Entry , error ) {
192+ func (me * MaskEngine ) exactMatch (filename fileuri , context []model.Dictionary ) (bool , []model.Entry , error ) {
202193 if me .temExactMatchEntry != nil && me .temExactMatchCSV != nil {
203- var csvList []model.Dictionary
204- if _ , ok := me .csvAllreadyRead [filename ]; ! ok {
205- var err error
206- csvList , err = me .readCSV (filename )
207- if err != nil {
208- return false , nil , err
209- }
194+ csvList , err := me .readCSV (filename )
195+ if err != nil {
196+ return false , nil , err
210197 }
211198
212199 var exactEntryBuffer bytes.Buffer
213200 if err := me .temExactMatchEntry .Execute (& exactEntryBuffer , context [0 ].UnpackUnordered ()); err != nil {
214201 return false , nil , err
215202 }
216- exactEntryString := exactEntryBuffer .String ()
217- err : = me .getExactMatchCsvResult (filename , csvList )
203+ exactEntryString := linekey ( exactEntryBuffer .String () )
204+ err = me .getExactMatchCsvResult (filename , csvList )
218205 if err != nil {
219206 return false , []model.Entry {}, err
220207 }
221208
222- results := me .csvEntryByKey [CSVKey {
223- filename : filename ,
224- lineKey : exactEntryString ,
225- }]
226- if len (results ) < 1 {
227- return false , results , nil
228- }
229- return true , results , nil
209+ results := me .readCsvEntryByKey (filename , exactEntryString )
210+
211+ return len (results ) > 0 , results , nil
230212 }
231213 return true , []model.Entry {}, nil
232214}
233215
234- func (me * MaskEngine ) readCSV (filename string ) ([]model.Dictionary , error ) {
235- recordsFromFile , err := uri .ReadCsv (filename , me .sep , me .comment , me .fieldsPerRecord , me .trimSpaces )
216+ func (me * MaskEngine ) readCsvEntryByKey (filename fileuri , exactEntryString linekey ) []model.Entry {
217+ cache , cacheExists := me .csvEntryByKey [filename ]
218+ if ! cacheExists {
219+ panic ("csv file is not cached, please report the bug on GitHub CGI-FR" )
220+ }
221+
222+ return cache [exactEntryString ]
223+ }
224+
225+ func (me * MaskEngine ) readCSV (filename fileuri ) (uri.DictRecords , error ) {
226+ recordsFromFile , err := uri .ReadCsvAsDicts (string (filename ), me .sep , me .comment , me .fieldsPerRecord , me .trimSpaces , me .header )
236227 if err != nil {
237228 return nil , err
238229 }
239- csvList := me .createEntriesFromCSVLines (recordsFromFile )
240- return csvList , nil
230+ return recordsFromFile , nil
241231}
242232
243233func (me * MaskEngine ) computeCSVLineKey (record model.Dictionary , exactMatch bool ) (string , error ) {
@@ -258,60 +248,32 @@ func (me *MaskEngine) computeCSVLineKey(record model.Dictionary, exactMatch bool
258248 return output .String (), nil
259249}
260250
261- func (me * MaskEngine ) getExactMatchCsvResult (filename string , csvList []model.Dictionary ) error {
262- for _ , record := range csvList {
263- lineKey , err := me .computeCSVLineKey (record , true )
264- if err != nil {
265- return err
266- }
251+ func (me * MaskEngine ) getExactMatchCsvResult (filename fileuri , csvList uri.DictRecords ) error {
252+ _ , cacheExists := me .csvEntryByKey [filename ]
253+ if ! cacheExists {
254+ cache := map [linekey ][]model.Entry {}
267255
268- key := CSVKey {
269- filename : filename ,
270- lineKey : lineKey ,
271- }
256+ for i := 0 ; i < csvList .Len (); i ++ {
257+ record := csvList .Get (i )
258+ lineKey , err := me .computeCSVLineKey (record , true )
259+ if err != nil {
260+ return err
261+ }
272262
273- if records , ok := me .csvEntryByKey [key ]; ok {
274- records = append (records , record )
275- me .csvEntryByKey [key ] = records
276- } else {
277- me .csvEntryByKey [key ] = []model.Entry {record }
263+ if records , ok := cache [linekey (lineKey )]; ok {
264+ records = append (records , record )
265+ cache [linekey (lineKey )] = records
266+ } else {
267+ cache [linekey (lineKey )] = []model.Entry {record }
268+ }
278269 }
270+
271+ me .csvEntryByKey [filename ] = cache
279272 }
280273
281274 return nil
282275}
283276
284- func (me * MaskEngine ) createEntriesFromCSVLines (records uri.CSVRecords ) []model.Dictionary {
285- results := []model.Dictionary {}
286-
287- for i := 0 ; i < records .Len (); i ++ {
288- record := records .Get (i )
289- if me .header {
290- obj := model .NewDictionary ()
291- headers := records .Get (0 )
292- for i , header := range headers {
293- if me .trimSpaces {
294- obj .Set (strings .TrimSpace (header ), strings .TrimSpace (record [i ]))
295- } else {
296- obj .Set (header , record [i ])
297- }
298- }
299- results = append (results , obj )
300- } else {
301- obj := model .NewDictionary ()
302- for i , value := range record {
303- if me .trimSpaces {
304- obj .Set (strconv .Itoa (i ), strings .TrimSpace (value ))
305- } else {
306- obj .Set (strconv .Itoa (i ), value )
307- }
308- }
309- results = append (results , obj )
310- }
311- }
312- return results
313- }
314-
315277// Get numbers of result waited in expected config, by default return as at-least-one
316278func (me * MaskEngine ) getExpectedResult (results []model.Entry ) (model.Entry , error ) {
317279 resultCount := len (results )
@@ -337,7 +299,7 @@ func (me *MaskEngine) getExpectedResult(results []model.Entry) (model.Entry, err
337299}
338300
339301// JaccardSimilarity calculates the Jaccard similarity between two strings.
340- func JaccardSimilarity (s1 , s2 string ) float64 {
302+ func jaccardSimilarity (s1 , s2 string ) float64 {
341303 if s1 == s2 {
342304 return 1.0
343305 }
@@ -398,7 +360,7 @@ func sortBySimilarity(jaccardEntryString string, list []JaccardCSV) []model.Entr
398360 var entriesWithSimilarity []EntryWithSimilarity
399361
400362 for _ , record := range list {
401- similarity := JaccardSimilarity (jaccardEntryString , record .lineKey )
363+ similarity := jaccardSimilarity (jaccardEntryString , record .lineKey )
402364 entriesWithSimilarity = append (entriesWithSimilarity , EntryWithSimilarity {Key : record .lineKey , Entry : record .csvLine , Similarity : similarity })
403365 }
404366
0 commit comments