Land a new pipeline for calculating fingerprints.
This now snapshots the IDFMap and updates it less often. It doesn't slow down loading, but it does improve the time to save a card from ~11 seconds to ~5 seconds.

Before, the IDFMap would be updated every time any card changed. This meant that when a card was saved, the IDFMap was updated (a possibly expensive operation), which then invalidated all other fingerprints. But for large webs, the IDFMap rarely changes much.

Now we only update the IDFMap if the set of changed cards is larger than 10% of the total number of cards. This means that after saving a single card, we use the pre-existing IDFMap.

This makes the common case of saving cards significantly more snappy.

Part of #694.
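
To make the new threshold concrete, here is a minimal TypeScript sketch of the reuse rule described above (illustrative only; shouldReuseSnapshot is a hypothetical name, and the real logic is idfMapForCards in src/nlp.ts below):

  //Reuse the memoized IDFMap unless the corpus has grown by more than
  //10% since the last full recalculation.
  const shouldReuseSnapshot = (cardCount : number, memoizedCount : number) : boolean =>
    cardCount >= memoizedCount && cardCount <= memoizedCount * 1.1;

  shouldReuseSnapshot(10000, 10000); //true: saving a card leaves the count unchanged, so the snapshot is reused
  shouldReuseSnapshot(11500, 10000); //false: the corpus grew by more than 10%, so the IDFMap is recalculated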

Merge branch 'fingerprint-performance'

* fingerprint-performance:
  Bring the load performance in line with what it was previously.
  Fix a performance issue when saving a card.
  Change the order of cardTFIDF and fingerprintForTFIDF to allow memoizeFirstArg.
  Wrap fingerprintForCardObj in memoizeFirstArg.
  Change it so cardObj is first argument for fingerprintForCardObj.
  Pop out fingerprintForCardObj.
  Factor out fingerprintForTFIDF and cardTFIDF to not be on fingerprint generator.
  Get rid of an unnecessary generator argument to Fingerprint constructor.
  Put idfMap calculation behind a memoization layer.
  Refactor it so the idfMap is calculated outside the fingerprint generator.
jkomoros committed Oct 6, 2024
2 parents 6ca1fdd + 7fb68f7 commit d9acd86
Showing 2 changed files with 120 additions and 101 deletions.
17 changes: 14 additions & 3 deletions src/memoize.ts
@@ -1,8 +1,19 @@
import { deepEqual } from './util.js';

const arrayEqual = (a : unknown[], b : unknown[]) : boolean => {
if (a.length != b.length) return false;
return a.every((a,index) => a === b[index]);
const trimUndefined = (arr: unknown[]): unknown[] => {
let lastDefinedIndex = arr.length - 1;
while (lastDefinedIndex >= 0 && arr[lastDefinedIndex] === undefined) {
lastDefinedIndex--;
}
return arr.slice(0, lastDefinedIndex + 1);
};

//arrayEqual will return true if the arrays are equal, ignoring undefined values at the end.
const arrayEqual = (a: unknown[], b: unknown[]): boolean => {
const trimmedA = trimUndefined(a);
const trimmedB = trimUndefined(b);
if (trimmedA.length !== trimmedB.length) return false;
return trimmedA.every((value, index) => value === trimmedB[index]);
};

//Like memoize, except the first argument is expected to be a thing that changes
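The trailing-undefined trimming matters because memoized functions are often called with trailing optional arguments omitted; a call that omits an argument and a call that passes undefined explicitly should presumably hit the same cache entry. A quick sketch of the new behavior:

  arrayEqual([1, 'a'], [1, 'a', undefined]); //true: trailing undefined is ignored
  arrayEqual([1, 'a', undefined], [1, 'a']); //true: trimming applies to both sides
  arrayEqual([1, undefined, 'a'], [1, 'a']); //false: only trailing undefined values are trimmed
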
204 changes: 106 additions & 98 deletions src/nlp.ts
@@ -1273,7 +1273,7 @@ export const possibleMissingConcepts = (cards : ProcessedCards) : Fingerprint =>
}

const resultMap = new Map(finalNgrams.map(ngram => [ngram, ngramBundles[ngram].scoreForBundle]));
return new Fingerprint(resultMap, Object.values(cards), maximumFingerprintGenerator);
return new Fingerprint(resultMap, Object.values(cards));
};

//suggestConceptReferencesForCard is very expensive, so memoize it.
@@ -1410,14 +1410,12 @@ const SEMANTIC_FINGERPRINT_MATCH_CONSTANT = 1.0;
export class Fingerprint {

_cards : ProcessedCard[];
_generator : FingerprintGenerator | undefined;
_items : Map<string, number>;
_memoizedWordCloud : WordCloud | null;
_memoizedFullWordCloud : WordCloud | null;

constructor(items? : Map<string, number>, cardOrCards? : ProcessedCard | ProcessedCard[], generator? : FingerprintGenerator) {
constructor(items? : Map<string, number>, cardOrCards? : ProcessedCard | ProcessedCard[]) {
this._cards = Array.isArray(cardOrCards) ? cardOrCards : (cardOrCards ? [cardOrCards] : []);
this._generator = generator;
this._items = items || new Map();
this._memoizedWordCloud = null;
this._memoizedFullWordCloud = null;
@@ -1600,114 +1598,121 @@ type WordNumbers = {
[word : string] : number
};

export class FingerprintGenerator {

_cards? : ProcessedCards;
_idfMap : {
[ngram : string] : number
};
_fingerprintSize : number;
_ngramSize : number;
_maxIDF : number;
_fingerprints : {
[id : CardID] : Fingerprint
};

constructor(cards? : ProcessedCards, optFingerprintSize : number = SEMANTIC_FINGERPRINT_SIZE, optNgramSize : number = MAX_N_GRAM_FOR_FINGERPRINT) {

this._cards = cards;
this._idfMap = {};
this._fingerprints = {};
this._fingerprintSize = optFingerprintSize;
this._ngramSize = optNgramSize;
type IDFMap = {
idf: WordNumbers,
maxIDF: number
};

if (!cards || Object.keys(cards).length == 0) return;
let memoizedIDFMap: IDFMap = {idf: {}, maxIDF: 0};
let memoizedIDMapCardCount = 0;
let memoizedIDFMapNgramSize = 0;

const idfMapForCards = (cards : ProcessedCards, ngramSize: number) : IDFMap => {
if (!cards || Object.keys(cards).length == 0) return {idf: {}, maxIDF: 0};
const cardCount = Object.keys(cards).length;
//Check if the card count is at least the count from the last time we calculated the idf map, and no more than 10% larger
const cardCountCloseEnough = cardCount >= memoizedIDMapCardCount && cardCount <= memoizedIDMapCardCount * 1.1;
if (cardCountCloseEnough && ngramSize == memoizedIDFMapNgramSize) return memoizedIDFMap;
const result = calcIDFMapForCards(cards, ngramSize);
memoizedIDFMap = result;
memoizedIDFMapNgramSize = ngramSize;
memoizedIDMapCardCount = cardCount;
return result;
};

//only consider cards that have a body, even if we were provided a set that included others
cards = Object.fromEntries(Object.entries(cards).filter(entry => BODY_CARD_TYPES[entry[1].card_type]));
const calcIDFMapForCards = (cards : ProcessedCards, ngramSize: number) : IDFMap => {
//only consider cards that have a body, even if we were provided a set that included others
cards = Object.fromEntries(Object.entries(cards).filter(entry => BODY_CARD_TYPES[entry[1].card_type]));

const numCards = Object.keys(cards).length;
const numCards = Object.keys(cards).length;

//cardWordCounts is an object that contains, for each card id, an object
//mapping words to their count in that card. This uses all words that
//could be searched over, and is the input to the IDF calculation
//pipeline and others.
const cardWordCounts : {[cardID : CardID]: {[word : string] : number}} = {};
for (const [key, cardObj] of Object.entries(cards)) {
cardWordCounts[key] = this._wordCountsForCardObj(cardObj);
}

//corpusWords is a set of word => numCardsContainWord, that is, the
//number of cards that contain the term at least once. This is how idf
//is normally calculated; we previously used the raw count of times it
//showed up.
const corpusWords : WordNumbers = {};
for (const words of Object.values(cardWordCounts)) {
for (const word of Object.keys(words)) {
corpusWords[word] = (corpusWords[word] || 0) + 1;
}
}
//cardWordCounts is an object that contains, for each card id, an object
//mapping words to their count in that card. This uses all words that
//could be searched over, and is the input to the IDF calculation
//pipeline and others.
const cardWordCounts : {[cardID : CardID]: {[word : string] : number}} = {};
for (const [key, cardObj] of Object.entries(cards)) {
cardWordCounts[key] = wordCountsForSemantics(cardObj, ngramSize);
}

//idf (inverse document frequency) of every word in the corpus. See
//https://en.wikipedia.org/wiki/Tf%E2%80%93idf
const idf : WordNumbers = {};
let maxIDF = 0;
for (const [word, count] of Object.entries(corpusWords)) {
idf[word] = Math.log10(numCards / (count + 1));
if (idf[word] > maxIDF) maxIDF = idf[word];
//corpusWords is a set of word => numCardsContainWord, that is, the
//number of cards that contain the term at least once. This is how idf
//is normally calculated; we previously used the raw count of times it
//showed up.
const corpusWords : WordNumbers = {};
for (const words of Object.values(cardWordCounts)) {
for (const word of Object.keys(words)) {
corpusWords[word] = (corpusWords[word] || 0) + 1;
}
//This is often useful, so stash it
this._idfMap = idf;
this._maxIDF = maxIDF;

//A map of cardID to the semantic fingerprint for that card.
const fingerprints : {[cardID : CardID] : Fingerprint} = {};
for (const [cardID, cardWordCount] of Object.entries(cardWordCounts)) {
//See https://en.wikipedia.org/wiki/Tf%E2%80%93idf for more on
//TF-IDF.
const tfidf = this._cardTFIDF(cardWordCount);
fingerprints[cardID] = this._fingerprintForTFIDF(tfidf, cards[cardID]);
}
this._fingerprints = fingerprints;
}

_fingerprintForTFIDF(tfidf : WordNumbers, cardOrCards : ProcessedCard | ProcessedCard[]) {
//Pick the keys for the items with the highest tfidf (the most important and specific to that card)
const keys = Object.keys(tfidf).sort((a, b) => tfidf[b] - tfidf[a]).slice(0, this.fingerprintSize());
const items = new Map(keys.map(key => [key, tfidf[key]]));
return new Fingerprint(items, cardOrCards, this);
//idf (inverse document frequency) of every word in the corpus. See
//https://en.wikipedia.org/wiki/Tf%E2%80%93idf
const idf : WordNumbers = {};
let maxIDF = 0;
for (const [word, count] of Object.entries(corpusWords)) {
idf[word] = Math.log10(numCards / (count + 1));
if (idf[word] > maxIDF) maxIDF = idf[word];
}
return {idf, maxIDF};
};

_wordCountsForCardObj(cardObj : ProcessedCard, optFieldList? : CardFieldType[]) {
//Filter out empty items for properties that don't have any items
return wordCountsForSemantics(cardObj, this._ngramSize, optFieldList);
const fingerprintForTFIDF = (tfidf : WordNumbers, cardOrCards : ProcessedCard | ProcessedCard[], fingerprintSize : number) => {
//Pick the keys for the items with the highest tfidf (the most important and specific to that card)
const keys = Object.keys(tfidf).sort((a, b) => tfidf[b] - tfidf[a]).slice(0, fingerprintSize);
const items = new Map(keys.map(key => [key, tfidf[key]]));
return new Fingerprint(items, cardOrCards);
};

const cardTFIDF = (cardWordCounts : WordNumbers, i : IDFMap) : WordNumbers => {
const idfMap = i.idf;
const maxIDF = i.maxIDF;
const resultTFIDF : WordNumbers = {};
const cardWordCount = Object.values(cardWordCounts).reduce((prev, curr) => prev + curr, 0);
for (const [word, count] of TypedObject.entries(cardWordCounts)) {
//idfMap should very often have all of the terms, but it can be
//missing one if we're using fingerprintForCardObj for a live
//editing card, and if it just had text added to it that includes
//uni-grams or bigrams that are so distinctive that they haven't
//been seen before. In that case we'll use the highest IDF we've
//seen in this corpus.
resultTFIDF[word] = (count / cardWordCount) * (idfMap[word] || maxIDF);
}
return resultTFIDF;
};

_cardTFIDF(cardWordCounts : WordNumbers) : WordNumbers {
const resultTFIDF : WordNumbers = {};
const cardWordCount = Object.values(cardWordCounts).reduce((prev, curr) => prev + curr, 0);
for (const [word, count] of TypedObject.entries(cardWordCounts)) {
//_idfMap should very often have all of the terms, but it can be
//missing one if we're using fingerprintForCardObj for a live
//editing card, and if it just had text added to it that includes
//uni-grams or bigrams that are so distinctive that they haven't
//been seen before. In that case we'll use the highest IDF we've
//seen in this corpus.
resultTFIDF[word] = (count / cardWordCount) * (this._idfMap[word] || this._maxIDF);
}
return resultTFIDF;
const fingerprintForCardObj = memoizeFirstArg((cardObj : ProcessedCard, idfMap : IDFMap, fingerprintSize : number, ngramSize : number, optFieldList? : CardFieldType[]) => {
if (!cardObj || Object.keys(cardObj).length == 0) return new Fingerprint();
const wordCounts = wordCountsForSemantics(cardObj, ngramSize, optFieldList);
const tfidf = cardTFIDF(wordCounts, idfMap);
const fingerprint = fingerprintForTFIDF(tfidf, cardObj, fingerprintSize);
return fingerprint;
});
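
//Because memoizeFirstArg keys on the frequently-changing first argument
//(cardObj), and the snapshotted idfMap passed as the second argument
//stays stable across single-card saves, saving one card recomputes only
//that card's fingerprint instead of invalidating every memoized one.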

export class FingerprintGenerator {

_cards : ProcessedCards;
_idfMap : IDFMap;
_fingerprintSize : number;
_ngramSize : number;
_cachedFingerprints? : {[cardID : string] : Fingerprint};

constructor(cards? : ProcessedCards, optFingerprintSize : number = SEMANTIC_FINGERPRINT_SIZE, optNgramSize : number = MAX_N_GRAM_FOR_FINGERPRINT) {
this._cards = cards || {};
this._ngramSize = optNgramSize;
this._idfMap = idfMapForCards(this._cards, this._ngramSize);
this._fingerprintSize = optFingerprintSize;
}

fingerprintForCardID(cardID : CardID) : Fingerprint {
return this.fingerprints()[cardID];
fingerprintForCardObj(cardObj : ProcessedCard, optFieldList? : CardFieldType[] ) : Fingerprint {
//A convenience method for other contexts that need to call this.
return fingerprintForCardObj(cardObj, this._idfMap, this._fingerprintSize, this._ngramSize, optFieldList);
}

fingerprintForCardObj(cardObj : ProcessedCard, optFieldList? : CardFieldType[]) {
if (!cardObj || Object.keys(cardObj).length == 0) return new Fingerprint();
const wordCounts = this._wordCountsForCardObj(cardObj, optFieldList);
const tfidf = this._cardTFIDF(wordCounts);
const fingerprint = this._fingerprintForTFIDF(tfidf, cardObj);
return fingerprint;
fingerprintForCardID(cardID : CardID) : Fingerprint {
const card = this._cards[cardID];
if (!card) return new Fingerprint();
return fingerprintForCardObj(card, this._idfMap, this._fingerprintSize, this._ngramSize);
}

fingerprintForCardIDList(cardIDs : CardID[]) : Fingerprint {
@@ -1724,12 +1729,15 @@ export class FingerprintGenerator {
combinedTFIDF[word] = (combinedTFIDF[word] || 0) + idf;
}
}
return this._fingerprintForTFIDF(combinedTFIDF, cardIDs.map(id => cards[id]));
return fingerprintForTFIDF(combinedTFIDF, cardIDs.map(id => cards[id]), this._fingerprintSize);
}

//returns a map of cardID => fingerprint for the cards that were provided to the constructor
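//Fingerprints are now computed lazily on first access and cached, rather
//than eagerly in the constructor, which keeps load time unchanged.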
fingerprints() {
return this._fingerprints;
if (this._cachedFingerprints) return this._cachedFingerprints;
const fingerprints = Object.fromEntries(Object.keys(this._cards).map(cardID => [cardID, this.fingerprintForCardID(cardID)]));
this._cachedFingerprints = fingerprints;
return fingerprints;
}

fingerprintSize() {
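To make the TF-IDF arithmetic above concrete, here is a small worked example (illustrative numbers; it mirrors calcIDFMapForCards and cardTFIDF from the diff):

  //A corpus of 100 body cards, with an ngram that appears in 9 of them.
  const numCards = 100;
  const cardsContainingNgram = 9;

  //idf = log10(numCards / (count + 1)) = log10(100 / 10) = 1
  const idf = Math.log10(numCards / (cardsContainingNgram + 1));

  //In a card with 50 total ngram occurrences, 5 of which are this ngram:
  const tf = 5 / 50; //0.1
  const tfidf = tf * idf; //0.1: one entry in the card's WordNumbers map

  //fingerprintForTFIDF then keeps the fingerprintSize highest-scoring
  //ngrams as the card's Fingerprint.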
