4 # FILE: SPT--Recommender.php
9 # SomeMethod($SomeParameter, $AnotherParameter)
10 # - short description of method
12 # AUTHOR: Edward Almasy
14 # Part of the Scout Portal Toolkit
15 # Copyright 2002-2004 Internet Scout Project
16 # http://scout.wisc.edu
21 # ---- PUBLIC INTERFACE --------------------------------------------------
22 # define content field types
34 # set default parameters
35 $this->ContentCorrelationThreshold = 1;
37 # save database object
40 # save new configuration values
48 # set default debug state
52 # set level for debugging output
59 # ---- recommendation methods
61 # recommend items for specified user
62 function Recommend($UserId, $StartingResult = 0, $NumberOfResults = 10)
64 if ($this->
DebugLevel > 0) { print(
"REC: Recommend(${UserId}, ${StartingResult}, ${NumberOfResults})<br>\n"); }
66 # load in user ratings
69 $DB->Query(
"SELECT ".$this->ItemIdFieldName.
", ".$this->RatingFieldName
70 .
" FROM ".$this->RatingTableName
71 .
" WHERE ".$this->UserIdFieldName.
" = ${UserId}");
72 while ($Row =
$DB->FetchRow())
77 if ($this->
DebugLevel > 1) { print(
"REC: user has rated ".count($Ratings).
" items<br>\n"); }
79 # for each item that user has rated
81 foreach ($Ratings as $ItemId => $ItemRating)
83 # for each content correlation available for that item
84 $DB->Query(
"SELECT Correlation, ItemIdB "
85 .
"FROM RecContentCorrelations "
86 .
"WHERE ItemIdA = ${ItemId}");
87 while ($Row =
$DB->FetchRow())
89 # multiply that correlation by normalized rating and add
90 # resulting value to recommendation value for that item
91 if (isset($RecVals[$Row[
"ItemIdB"]]))
93 $RecVals[$Row[
"ItemIdB"]] +=
94 $Row[
"Correlation"] * ($ItemRating - 50);
98 $RecVals[$Row[
"ItemIdB"]] =
99 $Row[
"Correlation"] * ($ItemRating - 50);
101 if ($this->
DebugLevel > 9) { print(
"REC: RecVal[".$Row[
"ItemIdB"].
"] = ".$RecVals[$Row[
"ItemIdB"]].
"<br>\n"); }
104 if ($this->
DebugLevel > 1) { print(
"REC: found ".count($RecVals).
" total recommendations<br>\n"); }
106 # calculate average correlation between items
107 $ResultThreshold =
$DB->Query(
"SELECT AVG(Correlation) "
108 .
"AS Average FROM RecContentCorrelations",
"Average");
109 $ResultThreshold = round($ResultThreshold) * 2;
111 # for each recommended item
112 foreach ($RecVals as $ItemId => $RecVal)
114 # remove item from list if user already rated it
115 if (isset($Ratings[$ItemId]))
117 unset($RecVals[$ItemId]);
121 # scale recommendation value back to match thresholds
122 $RecVals[$ItemId] = round($RecVal / 50);
124 # remove item from recommendation list if value is below threshold
125 if ($RecVals[$ItemId] < $ResultThreshold)
127 unset($RecVals[$ItemId]);
131 if ($this->
DebugLevel > 1) { print(
"REC: found ".count($RecVals).
" positive recommendations<br>\n"); }
133 # sort recommendation list by value
134 if (isset($RecVals)) { arsort($RecVals, SORT_NUMERIC); }
136 # save total number of results available
137 $this->NumberOfResultsAvailable = count($RecVals);
139 # trim result list to match range requested by caller
140 $RecValKeys = array_slice(
141 array_keys($RecVals), $StartingResult, $NumberOfResults);
142 $RecValSegment = array();
143 foreach ($RecValKeys as $Key)
145 $RecValSegment[$Key] = $RecVals[$Key];
148 # return recommendation list to caller
149 return $RecValSegment;
152 # add function to be called to filter returned recommendation list
155 # save filter function name
156 $this->FilterFuncs[] = $FunctionName;
159 # return number of recommendations generated
165 # return recommendation generation time
171 # return list of items used to generate recommendation of specified item
174 # pull list of correlations from DB
175 $this->DB->Query(
"SELECT * FROM RecContentCorrelations, ".$this->RatingTableName
176 .
" WHERE (ItemIdA = ${RecommendedItemId}"
177 .
" OR ItemIdB = ${RecommendedItemId})"
178 .
" AND ".$this->UserIdFieldName.
" = ".$UserId
179 .
" AND (RecContentCorrelations.ItemIdA = ".$this->RatingTableName.
".".$this->ItemIdFieldName
180 .
" OR RecContentCorrelations.ItemIdB = ".$this->RatingTableName.
".".$this->ItemIdFieldName.
")"
181 .
" AND Rating >= 50 "
182 .
" ORDER BY Correlation DESC");
184 # for each correlation
185 $SourceList = array();
186 while ($Row = $this->DB->FetchRow())
188 # pick out appropriate item ID
189 if ($Row[
"ItemIdA"] == $RecommendedItemId)
191 $ItemId = $Row[
"ItemIdB"];
195 $ItemId = $Row[
"ItemIdA"];
198 # add item to recommendation source list
199 $SourceList[$ItemId] = $Row[
"Correlation"];
202 # return recommendation source list to caller
206 # dynamically generate and return list of items similar to specified item
209 if ($this->
DebugLevel > 1) { print(
"REC: searching for items similar to item \"".$ItemId.
"\"<br>\n"); }
211 # make sure we have item IDs available
214 # start with empty array
215 $SimilarItems = array();
218 foreach ($this->ItemIds as $Id)
220 # if item is not specified item
223 # calculate correlation of item to specified item
226 # if correlation is above threshold
227 if ($Correlation > $this->ContentCorrelationThreshold)
229 # add item to list of similar items
230 $SimilarItems[$Id] = $Correlation;
234 if ($this->
DebugLevel > 3) { print(
"REC: ".count($SimilarItems).
" similar items to item \"".$ItemId.
"\" found<br>\n"); }
236 # filter list of similar items (if any)
237 if (count($SimilarItems) > 0)
240 if ($this->
DebugLevel > 4) { print(
"REC: ".count($SimilarItems).
" similar items to item \"".$ItemId.
"\" left after filtering<br>\n"); }
243 # if any similar items left
244 if (count($SimilarItems) > 0)
246 # sort list of similar items in order of most to least similar
247 arsort($SimilarItems, SORT_NUMERIC);
250 # return list of similar items to caller
251 return $SimilarItems;
254 # dynamically generate and return list of recommended field values for item
257 if ($this->
DebugLevel > 1) { print(
"REC: generating field value recommendations for item \"".$ItemId.
"\"<br>\n"); }
259 # start with empty array of values
262 # generate list of similar items
265 # if similar items found
266 if (count($SimilarItems) > 0)
268 # prune list of similar items to only top third of better-than-average
269 $AverageCorr = intval(array_sum($SimilarItems) / count($SimilarItems));
270 reset($SimilarItems);
271 $HighestCorr = current($SimilarItems);
272 $CorrThreshold = intval($HighestCorr - (($HighestCorr - $AverageCorr) / 3));
273 if ($this->
DebugLevel > 8) { print(
"REC: <i>Average Correlation: $AverageCorr Highest Correlation: $HighestCorr Correlation Threshold: $CorrThreshold </i><br>\n"); }
274 foreach ($SimilarItems as $ItemId => $ItemCorr)
276 if ($ItemCorr < $CorrThreshold)
278 unset($SimilarItems[$ItemId]);
281 if ($this->
DebugLevel > 6) { print(
"REC: ".count($SimilarItems).
" similar items left after threshold pruning<br>\n"); }
284 foreach ($SimilarItems as $SimItemId => $SimItemCorr)
287 foreach ($this->ContentFields as $FieldName => $FieldAttributes)
289 # load field data for this item
290 $FieldData = $this->GetFieldValue($SimItemId, $FieldName);
292 # if field data is array
293 if (is_array($FieldData))
295 # for each field data value
296 foreach ($FieldData as $FieldDataVal)
298 # if data value is not empty
299 $FieldDataVal = trim($FieldDataVal);
300 if (strlen($FieldDataVal) > 0)
302 # increment count for data value
303 $RecVals[$FieldName][$FieldDataVal]++;
309 # if data value is not empty
310 $FieldData = trim($FieldData);
311 if (strlen($FieldData) > 0)
313 # increment count for data value
314 $RecVals[$FieldName][$FieldData]++;
321 $MatchingCountThreshold = 3;
322 foreach ($RecVals as $FieldName => $FieldVals)
324 # determine cutoff threshold
325 arsort($FieldVals, SORT_NUMERIC);
327 $HighestCount = current($FieldVals);
328 $AverageCount = intval(array_sum($FieldVals) / count($FieldVals));
329 $CountThreshold = intval($AverageCount + (($HighestCount - $AverageCount) / 2));
330 if ($CountThreshold < $MatchingCountThreshold) { $CountThreshold = $MatchingCountThreshold; }
331 if ($this->
DebugLevel > 8) { print(
"REC: <i>Field: $FieldName Average Count: $AverageCount Highest Count: $HighestCount Count Threshold: $CountThreshold </i><br>\n"); }
333 # for each field data value
334 foreach ($FieldVals as $FieldVal => $FieldValCount)
336 # if value count is below threshold
337 if ($FieldValCount < $CountThreshold)
340 unset($RecVals[$FieldName][$FieldVal]);
344 if ($this->
DebugLevel > 3) { print(
"REC: found ".count($RecVals[$FieldName]).
" recommended values for field \"".$FieldName.
"\" after threshold pruning<br>\n"); }
348 # return recommended values to caller
353 # ---- database update methods
357 if ($this->
DebugLevel > 0) { print(
"REC: UpdateForItems(${StartingItemId}, ${NumberOfItems})<br>\n"); }
358 # make sure we have item IDs available
364 foreach ($this->ItemIds as $ItemId)
366 # if item ID is within requested range
367 if ($ItemId >= $StartingItemId)
369 # update recommender info for item
370 if ($this->
DebugLevel > 1) { print(
"REC: doing item ${ItemId}<br>\n"); }
374 # if we have done requested number of items
375 if ($ItemsUpdated >= $NumberOfItems)
378 if ($this->
DebugLevel > 1) { print(
"REC: bailing out with item ${ItemId}<br>\n"); }
384 # return ID of last resource updated to caller
390 if ($this->
DebugLevel > 1) { print(
"REC: updating for item \"".$ItemId.
"\"<br>\n"); }
392 # make sure we have item IDs available
395 # clear existing correlations for this item
396 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
397 .
"WHERE ItemIdA = ${ItemId}");
400 foreach ($this->ItemIds as $Id)
402 # if not doing a full pass, or item is later in list than current item
403 if (($FullPass == FALSE) || ($Id > $ItemId))
405 # update correlation value for item and target item
413 # drop all correlation entries referring to item
414 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
415 .
"WHERE ItemIdA = ".$ItemId.
" "
416 .
"OR ItemIdB = ".$ItemId);
421 # get average correlation
422 $AverageCorrelation = $this->DB->Query(
"SELECT AVG(Correlation) "
423 .
"AS Average FROM RecContentCorrelations",
"Average");
425 # dump all below-average correlations
426 if ($AverageCorrelation > 0)
428 $this->DB->Query(
"DELETE FROM RecContentCorrelations "
429 .
"WHERE Correlation <= ${AverageCorrelation}");
442 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM "
443 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
444 $ItemIds = $this->DB->FetchColumn(
"Id");
450 # ---- PRIVATE INTERFACE -------------------------------------------------
469 # if item IDs not already loaded
470 if (!isset($this->ItemIds))
472 # load item IDs from DB
473 $this->DB->Query(
"SELECT ".$this->ItemIdFieldName.
" AS Id FROM "
474 .$this->ItemTableName.
" ORDER BY ".$this->ItemIdFieldName);
475 $this->ItemIds = array();
476 while ($Item = $this->DB->FetchRow())
478 $this->ItemIds[] = $Item[
"Id"];
486 static $CachedItemList;
488 # if data not already loaded
489 if (!isset($ItemData[$ItemId][$FieldName]))
491 # load field value from DB
492 $FieldValue = $this->GetFieldValue($ItemId, $FieldName);
494 # if field value is array
495 if (is_array($FieldValue))
497 # concatenate together text from array elements
498 $FieldValue = implode(
" ", $FieldValue);
501 # normalize text and break into word array
504 # if more items than cache limit
505 if (count($ItemData) > 1000)
509 list($DumpedItemId, $DumpedItemData) = each($ItemData);
510 unset($ItemData[$DumpedItemId]);
514 # return cached data to caller
515 return $ItemData[$ItemId][$FieldName];
518 # calculate content correlation between two items and return value to caller
521 static $CorrelationCache;
523 if ($this->
DebugLevel > 10) { print(
"REC: calculating correlation between items $ItemIdA and $ItemIdB<br>\n"); }
525 # order item ID numbers
526 if ($ItemIdA > $ItemIdB)
533 # if we already have the correlation
534 if (isset($CorrelationCache[$ItemIdA][$ItemIdB]))
536 # retrieve correlation from cache
537 $TotalCorrelation = $CorrelationCache[$ItemIdA][$ItemIdB];
541 # if list of fields to correlate specified
542 if ($FieldList != NULL)
544 # create list with only specified fields
545 foreach ($FieldList as $FieldName)
556 # for each content field
557 $TotalCorrelation = 0;
560 # if field is of a type that we use for correlation
561 $FieldType = intval($FieldAttributes[
"FieldType"]);
568 if ($this->
DebugLevel > 15) { print(
"REC: loaded ".count($ItemAData).
" terms for item #".$ItemIdA.
" and ".count($ItemBData).
" terms for item #".$ItemIdB.
" for field \"".$FieldName.
"\"<br>\n"); }
570 # call appropriate routine to get correlation
576 $ItemAData, $ItemBData);
580 # add correlation multiplied by weight to total
581 $TotalCorrelation += $Correlation * $FieldAttributes[
"Weight"];
585 # store correlation to cache
586 $CorrelationCache[$ItemIdA][$ItemIdB] = $TotalCorrelation;
589 # return correlation value to caller
590 if ($this->
DebugLevel > 9) { print(
"REC: correlation between items $ItemIdA and $ItemIdB found to be $TotalCorrelation<br>\n"); }
591 return $TotalCorrelation;
594 # calculate content correlation between two items and update in DB
597 if ($this->
DebugLevel > 6) { print(
"REC: updating correlation between items $ItemIdA and $ItemIdB<br>\n"); }
599 # bail out if two items are the same
600 if ($ItemIdA == $ItemIdB) {
return; }
602 # calculate correlation
605 # save new correlation
679 # strip any HTML tags
680 $Text = strip_tags($Text);
# strip any punctuation
# (the punctuation marks have to sit inside a character class -- without the
#   enclosing [...] the pattern only matches the literal character sequence
#   ,.?-()[]" and so leaves ordinary punctuation in the text untouched)
$Text = preg_replace('/[,.?\-()\[\]"]/', " ", $Text);
685 # normalize whitespace
686 $Text = trim(preg_replace("/[\\s]+/
", " ", $Text));
688 # convert to all lower case
689 $Text = strtolower($Text);
691 # split text into arrays of words
692 $Words = explode(" ", $Text);
694 # filter out all stop words
695 $Words = array_diff($Words, $StopWords);
697 # return word array to caller
# calculate text correlation score for two arrays of words
#   $WordsA - array of words for first item
#   $WordsB - array of words for second item
#   returns number of entries in $WordsA that also occur in $WordsB
#   (a word repeated in $WordsA is counted once per occurrence)
function CalcTextCorrelation($WordsA, $WordsB)
{
    # number of words from first array that are also in second array is score
    return count(array_intersect($WordsA, $WordsB));
}
710 function ContentCorrelation($ItemIdA, $ItemIdB, $NewCorrelation = -1)
712 # if item ID A is greater than item ID B
713 if ($ItemIdA > $ItemIdB)
721 # if new correlation value provided
722 if ($NewCorrelation != -1)
724 # if new value is above threshold
725 if ($NewCorrelation >= $this->ContentCorrelationThreshold)
727 # insert new correlation value in DB
728 $this->DB->Query("INSERT INTO RecContentCorrelations
"
729 ."(ItemIdA, ItemIdB, Correlation)
"
730 ."VALUES (${ItemIdA}, ${ItemIdB}, ${NewCorrelation})
");
732 # return correlation value is new value
733 $Correlation = $NewCorrelation;
738 # return value is zero
744 # retrieve correlation value from DB
745 $Correlation = $this->DB->Query(
746 "SELECT Correlation FROM RecContentCorrelations
"
747 ."WHERE ItemIdA = ${ItemIdA} AND ItemIdB = ${ItemIdB}
",
750 # if no value found in DB
751 if ($Correlation == FALSE)
753 # return value is zero
758 # return correlation value to caller
# remove results that are rejected by any of the registered filter functions
#   $Results - array of results, indexed by resource ID
#   returns the same array with every rejected resource removed
# (each filter function is called with a resource ID and rejects that
#   resource by returning TRUE)
function FilterOnSuppliedFunctions($Results)
{
    # if filter functions have been set
    # (empty() is used rather than count() so that a filter list that was
    #   never initialized is treated as "no filters")
    if (!empty($this->FilterFuncs))
    {
        # for each result
        foreach ($Results as $ResourceId => $Result)
        {
            # for each filter function
            foreach ($this->FilterFuncs as $FuncName)
            {
                # if filter function returns TRUE for result resource
                if ($FuncName($ResourceId))
                {
                    # discard result
                    if ($this->DebugLevel > 2) { print("REC: filter callback rejected resource {$ResourceId}<br>\n"); }
                    unset($Results[$ResourceId]);

                    # bail out of filter func loop
                    # (resource is already gone, so remaining filters are moot)
                    break;
                }
            }
        }
    }

    # return filtered list to caller
    return $Results;
}