Tuesday, 18 March 2014

Keyword Relevancy Algorithms

A small and simple algorithm for finding the relevant keywords in a text works as follows:
  1. Initialising array of stop words and keywords,
  2. Text preprocessing, such as lowercasing and removing special characters,
  3. Determining general text statistics, such as number of characters,
  4. Removing stop words,
  5. Exploding text to keyword array,
  6. Iterating over the keyword array in order to count each word,
  7. Sorting and delivering result.

Algorithm: JavaScript

/////////////////////////////////////////////////////////////////////////////
//
// The following code snippet works on the text from document.f.text.value
// Within this JavaScript solution, there is no sorting!
//
/////////////////////////////////////////////////////////////////////////////

// init
// Lowercase stop words; they are matched against whole, lowercased tokens.
// NOTE(review): "trough" may be a typo for "through" - confirm before changing.
var aStopWords = ["about", "all", "alone", "also", "am", "and", "as", "at", "because", "before", "beside", "besides", "between", "but", "by", "etc", "for", "i", "of", "on", "other", "others", "so", "than", "that", "though", "to", "too", "trough", "until"];

/**
 * Splits a text into lowercase words.
 *
 * Every character that is not a-z, 0-9 or one of the German letters
 * ä, ö, ü, ß (this includes all whitespace and line breaks) is treated as a
 * word separator. Empty tokens produced by consecutive separators are dropped.
 *
 * @param {string} sText - the raw text
 * @returns {string[]} the words in their order of appearance
 */
function tokenizeText(sText) {
    // Lowercase first so the character class only needs lowercase ranges.
    var sClean = sText.toLowerCase().replace(/[^a-z0-9äöüß]/g, ' ');
    var aParts = sClean.split(' ');
    var aTokens = [];

    for (var i = 0; i < aParts.length; i++) {
        if (aParts[i] !== '') {
            aTokens.push(aParts[i]);
        }
    }

    return aTokens;
}

/**
 * Determines text statistics and counts every word that is not a stop word.
 *
 * @param {string} sText - the raw text (null/undefined are treated as empty)
 * @param {string[]} [aStop] - lowercase stop words; defaults to aStopWords
 * @returns {{iCharCount: number, iCharCountWithout: number, iWordCount: number, aKeywords: Object}}
 *     iCharCount: length of the raw text;
 *     iCharCountWithout: summed length of all words (separators excluded);
 *     iWordCount: number of words, stop words included;
 *     aKeywords: prototype-less dictionary mapping keyword to occurrence count.
 */
function analyseKeywords(sText, aStop) {
    var sSource = (sText == null) ? '' : String(sText);
    var aStopList = aStop || aStopWords;
    var aTokens = tokenizeText(sSource);

    // Prototype-less dictionaries: words such as "constructor" or "length"
    // must not collide with inherited properties.
    var oStop = Object.create(null);
    for (var m = 0; m < aStopList.length; m++) {
        oStop[aStopList[m]] = true;
    }

    var oKeywords = Object.create(null);
    var iCharCountWithout = 0;

    for (var x = 0; x < aTokens.length; x++) {
        var sWord = aTokens[x];
        iCharCountWithout += sWord.length;

        // Filtering whole tokens removes every occurrence of a stop word,
        // including those at the start or end of the text.
        if (oStop[sWord]) {
            continue;
        }

        oKeywords[sWord] = (oKeywords[sWord] || 0) + 1;
    }

    return {
        iCharCount: sSource.length,
        iCharCountWithout: iCharCountWithout,
        iWordCount: aTokens.length,
        aKeywords: oKeywords
    };
}

/**
 * Builds the plain-text report for the first iMax keywords.
 *
 * Keywords are listed in the dictionary's enumeration order; no sorting by
 * frequency takes place. Percentages are relative to the total word count
 * (stop words included) and are shown with two decimals.
 *
 * @param {Object} oStats - result of analyseKeywords()
 * @param {number} [iMax=10] - maximum number of keywords to list
 * @returns {string} the report, one keyword per line after the heading
 */
function formatKeywordReport(oStats, iMax) {
    var iLimit = (iMax === undefined) ? 10 : iMax;
    var aKeys = Object.keys(oStats.aKeywords);
    var sReport = "Found keywords:";

    // stop after iLimit results
    for (var k = 0; k < aKeys.length && k < iLimit; k++) {
        var iNumber = oStats.aKeywords[aKeys[k]];
        // Math.round() takes a single argument; toFixed() gives two decimals.
        var fQuotient = (100 * iNumber / oStats.iWordCount).toFixed(2);
        sReport += "\n" + iNumber + " times (" + fQuotient + " %): " + aKeys[k];
    }

    return sReport;
}

// Browser entry point: analyse the text of the form field and alert the
// result. Skipped where no DOM is available, so the helpers above can be
// loaded on their own.
if (typeof document !== "undefined") {
    // the text
    var sText = document.f.text.value;

    // statistics and keyword counts
    var oStats = analyseKeywords(sText, aStopWords);

    // result
    var sAlert = formatKeywordReport(oStats, 10);

    // alerting result
    alert(sAlert);
}

// end
*************************************************************************************



Algorithm: PHP


/////////////////////////////////////////////////////////////////////////////
//
// The following code snippet works on a text delivered via $_POST["text"]
//
/////////////////////////////////////////////////////////////////////////////

// init
// Lowercase stop words; they are matched against whole, lowercased tokens.
$aStopWords = array ('about', 'all', 'alone', 'also', 'am', 'and', 'as', 'at', 'because', 'before', 'beside', 'besides', 'between', 'but', 'by', 'etc', 'for', 'i', 'of', 'on', 'other', 'others', 'so', 'than', 'that', 'though', 'to', 'too', 'trough', 'until');

/**
 * Determines text statistics and counts every word that is not a stop word.
 *
 * The text is expected to be UTF-8. It is lowercased, and every character
 * other than a-z, 0-9, ä, ö, ü, ß and the HTML entity characters & ; # is
 * treated as a word separator.
 *
 * @param string $sText      the raw text
 * @param array  $aStopWords lowercase stop words
 * @return array with the keys
 *               'iCharCount'        => number of characters of the raw text,
 *               'iCharCountWithout' => summed length of all words,
 *               'iWordCount'        => number of words, stop words included,
 *               'aKeywords'         => keyword => count, sorted by count (descending)
 */
function countKeywords($sText, $aStopWords) {
    $sText = (string) $sText;

    // convert to lowercase (multibyte-aware, so umlauts are handled as well)
    $sClean = mb_strtolower($sText, 'UTF-8');

    // remove special characters, including line breaks and tabs;
    // the /u modifier makes the pattern operate on UTF-8 characters
    $sClean = preg_replace('/[^a-z0-9äöüß&;#]/u', ' ', $sClean);
    if ($sClean === null) {
        // preg_replace() returns null on error, e.g. for invalid UTF-8
        $sClean = '';
    }

    // explode to array of words; runs of spaces yield no empty words
    $aWords = preg_split('/ +/', $sClean, -1, PREG_SPLIT_NO_EMPTY);

    // stop words as array keys allow a direct lookup per word
    $aStopLookup = array_flip($aStopWords);

    $aKeywords = array ();
    $iCharCountWithout = 0;

    // every word
    foreach ($aWords as $sWord) {
        $iCharCountWithout += mb_strlen($sWord, 'UTF-8');

        // Skipping whole words removes every occurrence of a stop word,
        // including those at the start or end of the text.
        if (isset ($aStopLookup[$sWord])) {
            continue;
        }

        if (isset ($aKeywords[$sWord])) {
            // already counted: increase count of this word
            $aKeywords[$sWord]++;
        }
        else {
            $aKeywords[$sWord] = 1;
        }
    }

    // sort by count, most frequent first
    arsort($aKeywords);

    return array (
        'iCharCount'        => mb_strlen($sText, 'UTF-8'),
        'iCharCountWithout' => $iCharCountWithout,
        'iWordCount'        => count($aWords),
        'aKeywords'         => $aKeywords
    );
}

/**
 * Renders the keyword counts as an HTML table.
 *
 * @param array $aKeywords  keyword => count, in output order
 * @param int   $iWordCount total number of words; base of the percentage
 * @return string the HTML markup of the table
 */
function renderKeywordTable($aKeywords, $iWordCount) {
    $sHtml = '<table>';
    $sHtml .= '<tr><th>Count</th>';
    $sHtml .= '<th>Percentage</th>';
    $sHtml .= '<th>Found keyword</th></tr>';

    foreach ($aKeywords as $sKey => $iNumber) {
        $iNumber = intval($iNumber);
        $fQuotient = number_format(round(100 * ($iNumber / $iWordCount), 2), 2);
        $sHtml .= '<tr><td>' . $iNumber . ' </td>';
        $sHtml .= '<td>' . $fQuotient . ' %</td>';
        // Escape for HTML output; double_encode is off so that entities
        // already contained in a keyword (e.g. &auml;) are kept as they are.
        $sHtml .= '<td>' . htmlspecialchars((string) $sKey, ENT_QUOTES, 'UTF-8', false) . '</td></tr>';
    }

    $sHtml .= '</table>';

    return $sHtml;
}

// the text; a missing or non-string field is treated as empty text
$sText = (isset ($_POST["text"]) && is_string($_POST["text"])) ? $_POST["text"] : '';

// statistics and keyword counts
$aResult = countKeywords($sText, $aStopWords);
$iCharCount = $aResult['iCharCount'];
$iCharCountWithout = $aResult['iCharCountWithout'];
$iWordCount = $aResult['iWordCount'];
$aKeywords = $aResult['aKeywords'];

// result
echo renderKeywordTable($aKeywords, $iWordCount);

// end

0 comments:

Post a Comment

Followers