TheDeveloperBlog.com


C# Stopword Dictionary

Stopwords are useless in queries. These words, such as "the", "because" and "how", are considered stopwords and can usually be safely ignored. Here we remove stopwords, also known as poison words, from strings in the C# language.

Requirements
    Unimportant words are removed.

Input:  I saw a cat and a horse.
Output: saw cat horse

Input:  Google searches the Internet
Output: Google searches Internet

Input:  Using an extra step to eliminate stopwords
Output: Using extra step eliminate stopwords


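Example. First, to remove the stopwords we must have a way to instantly check whether a word is a stopword. If you store the stopwords in a List, every check has to scan the whole list, so the total work grows with the number of input words multiplied by the number of stopwords, and you could have performance problems. A Dictionary avoids this because each lookup is a single hash probe.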

Lists
C# program that removes stopwords

using System;
using System.Collections.Generic;
using System.Text;

/// <summary>
/// Tool to remove unwanted words such as 'the' or 'a' from a string,
/// for example before the string is used as a search query.
/// </summary>
static class StopwordTool
{
    /// <summary>
    /// Words we want to remove. Only the keys matter: the dictionary is used
    /// as a hash-based set of stopwords. Keys are compared with
    /// <see cref="StringComparer.OrdinalIgnoreCase"/>, so matching ignores case
    /// and does not depend on the casing rules of the current culture.
    /// </summary>
    static readonly Dictionary<string, bool> _stops =
        new Dictionary<string, bool>(StringComparer.OrdinalIgnoreCase)
    {
        // a
        { "a", true }, { "about", true }, { "above", true }, { "across", true },
        { "after", true }, { "afterwards", true }, { "again", true }, { "against", true },
        { "all", true }, { "almost", true }, { "alone", true }, { "along", true },
        { "already", true }, { "also", true }, { "although", true }, { "always", true },
        { "am", true }, { "among", true }, { "amongst", true }, { "amount", true },
        { "an", true }, { "and", true }, { "another", true }, { "any", true },
        { "anyhow", true }, { "anyone", true }, { "anything", true }, { "anyway", true },
        { "anywhere", true }, { "are", true }, { "around", true }, { "as", true },
        { "at", true },
        // b
        { "back", true }, { "be", true }, { "became", true }, { "because", true },
        { "become", true }, { "becomes", true }, { "becoming", true }, { "been", true },
        { "before", true }, { "beforehand", true }, { "behind", true }, { "being", true },
        { "below", true }, { "beside", true }, { "besides", true }, { "between", true },
        { "beyond", true }, { "bill", true }, { "both", true }, { "bottom", true },
        { "but", true }, { "by", true },
        // c
        { "call", true }, { "can", true }, { "cannot", true }, { "cant", true },
        { "co", true }, { "computer", true }, { "con", true }, { "could", true },
        { "couldnt", true }, { "cry", true },
        // d
        { "de", true }, { "describe", true }, { "detail", true }, { "do", true },
        { "done", true }, { "down", true }, { "due", true }, { "during", true },
        // e
        { "each", true }, { "eg", true }, { "eight", true }, { "either", true },
        { "eleven", true }, { "else", true }, { "elsewhere", true }, { "empty", true },
        { "enough", true }, { "etc", true }, { "even", true }, { "ever", true },
        { "every", true }, { "everyone", true }, { "everything", true }, { "everywhere", true },
        { "except", true },
        // f
        { "few", true }, { "fifteen", true }, { "fifty", true }, { "fill", true },
        { "find", true }, { "fire", true }, { "first", true }, { "five", true },
        { "for", true }, { "former", true }, { "formerly", true }, { "forty", true },
        { "found", true }, { "four", true }, { "from", true }, { "front", true },
        { "full", true }, { "further", true },
        // g
        { "get", true }, { "give", true }, { "go", true },
        // h
        { "had", true }, { "has", true }, { "have", true }, { "he", true },
        { "hence", true }, { "her", true }, { "here", true }, { "hereafter", true },
        { "hereby", true }, { "herein", true }, { "hereupon", true }, { "hers", true },
        { "herself", true }, { "him", true }, { "himself", true }, { "his", true },
        { "how", true }, { "however", true }, { "hundred", true },
        // i
        { "i", true }, { "ie", true }, { "if", true }, { "in", true },
        { "inc", true }, { "indeed", true }, { "interest", true }, { "into", true },
        { "is", true }, { "it", true }, { "its", true }, { "itself", true },
        // k
        { "keep", true },
        // l
        { "last", true }, { "latter", true }, { "latterly", true }, { "least", true },
        { "less", true }, { "ltd", true },
        // m
        { "made", true }, { "many", true }, { "may", true }, { "me", true },
        { "meanwhile", true }, { "might", true }, { "mill", true }, { "mine", true },
        { "more", true }, { "moreover", true }, { "most", true }, { "mostly", true },
        { "move", true }, { "much", true }, { "must", true }, { "my", true },
        { "myself", true },
        // n
        { "name", true }, { "namely", true }, { "neither", true }, { "never", true },
        { "nevertheless", true }, { "next", true }, { "nine", true }, { "no", true },
        { "nobody", true }, { "none", true }, { "nor", true }, { "not", true },
        { "nothing", true }, { "now", true }, { "nowhere", true },
        // o
        { "of", true }, { "off", true }, { "often", true }, { "on", true },
        { "once", true }, { "one", true }, { "only", true }, { "onto", true },
        { "or", true }, { "other", true }, { "others", true }, { "otherwise", true },
        { "our", true }, { "ours", true }, { "ourselves", true }, { "out", true },
        { "over", true }, { "own", true },
        // p
        { "part", true }, { "per", true }, { "perhaps", true }, { "please", true },
        { "put", true },
        // r
        { "rather", true }, { "re", true },
        // s
        { "same", true }, { "see", true }, { "seem", true }, { "seemed", true },
        { "seeming", true }, { "seems", true }, { "serious", true }, { "several", true },
        { "she", true }, { "should", true }, { "show", true }, { "side", true },
        { "since", true }, { "sincere", true }, { "six", true }, { "sixty", true },
        { "so", true }, { "some", true }, { "somehow", true }, { "someone", true },
        { "something", true }, { "sometime", true }, { "sometimes", true }, { "somewhere", true },
        { "still", true }, { "such", true }, { "system", true },
        // t
        { "take", true }, { "ten", true }, { "than", true }, { "that", true },
        { "the", true }, { "their", true }, { "them", true }, { "themselves", true },
        { "then", true }, { "thence", true }, { "there", true }, { "thereafter", true },
        { "thereby", true }, { "therefore", true }, { "therein", true }, { "thereupon", true },
        { "these", true }, { "they", true }, { "thick", true }, { "thin", true },
        { "third", true }, { "this", true }, { "those", true }, { "though", true },
        { "three", true }, { "through", true }, { "throughout", true }, { "thru", true },
        { "thus", true }, { "to", true }, { "together", true }, { "too", true },
        { "top", true }, { "toward", true }, { "towards", true }, { "twelve", true },
        { "twenty", true }, { "two", true },
        // u
        { "un", true }, { "under", true }, { "until", true }, { "up", true },
        { "upon", true }, { "us", true },
        // v
        { "very", true }, { "via", true },
        // w
        { "was", true }, { "we", true }, { "well", true }, { "were", true },
        { "what", true }, { "whatever", true }, { "when", true }, { "whence", true },
        { "whenever", true }, { "where", true }, { "whereafter", true }, { "whereas", true },
        { "whereby", true }, { "wherein", true }, { "whereupon", true }, { "wherever", true },
        { "whether", true }, { "which", true }, { "while", true }, { "whither", true },
        { "who", true }, { "whoever", true }, { "whole", true }, { "whom", true },
        { "whose", true }, { "why", true }, { "will", true }, { "with", true },
        { "within", true }, { "without", true }, { "would", true },
        // y
        { "yet", true }, { "you", true }, { "your", true }, { "yours", true },
        { "yourself", true }, { "yourselves", true }
    };

    /// <summary>
    /// Chars that separate words. Kept in a static field so the array is
    /// allocated once rather than on every call.
    /// </summary>
    static readonly char[] _delimiters = new char[]
    {
        ' ',
        ',',
        ';',
        '.'
    };

    /// <summary>
    /// Remove stopwords from string. Repeated words are also removed: only the
    /// first occurrence of each word (ignoring case) is kept, in its original casing.
    /// </summary>
    /// <param name="input">Text to filter; words are separated by space, comma, semicolon or period.</param>
    /// <returns>The remaining words joined by single spaces, or an empty string if none remain.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string RemoveStopwords(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        // Split parameter into words, discarding the empty entries produced
        // by consecutive delimiters such as ", ".
        string[] words = input.Split(_delimiters,
            StringSplitOptions.RemoveEmptyEntries);

        // Words already written to the result, compared the same way as the
        // stopword keys so "Cat" and "cat" count as the same word.
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        StringBuilder builder = new StringBuilder();
        foreach (string currentWord in words)
        {
            // Skip stopwords first so they are never recorded as seen.
            // HashSet.Add returns false when the word was already present.
            if (_stops.ContainsKey(currentWord) || !seen.Add(currentWord))
            {
                continue;
            }

            builder.Append(currentWord).Append(' ');
        }

        // Trim drops the trailing separator, plus any whitespace that is not a
        // delimiter (e.g. a tab) at either end of the result.
        return builder.ToString().Trim();
    }
}

/// <summary>
/// Console demo that prints the filtered form of a few sample sentences.
/// </summary>
class Program
{
    /// <summary>
    /// Runs each sample sentence through StopwordTool.RemoveStopwords and
    /// writes the result on its own line.
    /// </summary>
    static void Main()
    {
        string[] samples =
        {
            "I saw a cat and a horse",
            "Google searches the Internet",
            "Using an extra step"
        };

        foreach (string sample in samples)
        {
            string filtered = StopwordTool.RemoveStopwords(sample);
            Console.WriteLine(filtered);
        }
    }
}

Output

saw cat horse
Google searches Internet
Using extra step

The method loops through each word in the input string and checks whether the word is a stopword or has already been found. If the word is acceptable, it is appended to a StringBuilder, whose contents become the result.

StringBuilder

It declares a static Dictionary to store the stopwords. It adds the items using collection initializer syntax to the Dictionary. It is static because it doesn't need to save state. Having different instances would simply waste memory.

DictionaryStatic

It removes duplicate words from the strings as well, which further aids in preparing search queries. The implicit var keyword in RemoveStopwords helps make the syntax cleaner and easier to read, at no performance or functional loss.

Var

And: The split method is used in RemoveStopwords with cached delimiters, which enhance performance. It separates the string into words.

Split

Summary. We saw a method that will remove specific words, such as stopwords, from a string. It uses Dictionary and static fields to accomplish this. It avoids degenerate performance with large data sets.

Warning: Looping through all stopwords for each word being checked could become slow, particularly as your list of stopwords grows larger.