TheDeveloperBlog.com

Home | Contact Us

C-Sharp | Java | Python | Swift | GO | WPF | Ruby | Scala | F# | JavaScript | SQL | PHP | Angular | HTML

C# Stopword Dictionary

This C# program removes common words called stopwords from strings. It uses a Dictionary.

Stopwords are useless in queries.

These words, such as "the", "because" and "how", are considered stopwords and can usually be safely ignored. Here we remove stopwords, also known as poison words, from strings in the C# language.

Requirements
    Unimportant words are removed.

Input:  I saw a cat and a horse.
Output: saw cat horse

Input:  Google searches the Internet
Output: Google searches Internet

Input:  Using an extra step to eliminate stopwords
Output: Using extra step eliminate stopwords

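Example. First, to remove the stopwords we must have a way to quickly check whether a word is a stopword. If you store the stopwords in a List, every check scans the whole list, so the total work grows with the number of input words multiplied by the number of stopwords (roughly quadratic), meaning you could have performance problems.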

Lists

C# program that removes stopwords

using System;
using System.Collections.Generic;
using System.Text;

/// <summary>
/// Tool to remove unwanted words such as 'the' or 'a' from a string.
/// Matching is case-insensitive and does not depend on the current culture.
/// </summary>
static class StopwordTool
{
    /// <summary>
    /// Words we want to remove. Only the keys matter; the value is always true.
    /// The ordinal case-insensitive comparer lets "The", "THE" and "the" all
    /// match without lower-casing each input word first.
    /// </summary>
    private static readonly Dictionary<string, bool> _stops =
        new Dictionary<string, bool>(StringComparer.OrdinalIgnoreCase)
    {
        { "a", true },
        { "about", true },
        { "above", true },
        { "across", true },
        { "after", true },
        { "afterwards", true },
        { "again", true },
        { "against", true },
        { "all", true },
        { "almost", true },
        { "alone", true },
        { "along", true },
        { "already", true },
        { "also", true },
        { "although", true },
        { "always", true },
        { "am", true },
        { "among", true },
        { "amongst", true },
        { "amount", true },
        { "an", true },
        { "and", true },
        { "another", true },
        { "any", true },
        { "anyhow", true },
        { "anyone", true },
        { "anything", true },
        { "anyway", true },
        { "anywhere", true },
        { "are", true },
        { "around", true },
        { "as", true },
        { "at", true },
        { "back", true },
        { "be", true },
        { "became", true },
        { "because", true },
        { "become", true },
        { "becomes", true },
        { "becoming", true },
        { "been", true },
        { "before", true },
        { "beforehand", true },
        { "behind", true },
        { "being", true },
        { "below", true },
        { "beside", true },
        { "besides", true },
        { "between", true },
        { "beyond", true },
        { "bill", true },
        { "both", true },
        { "bottom", true },
        { "but", true },
        { "by", true },
        { "call", true },
        { "can", true },
        { "cannot", true },
        { "cant", true },
        { "co", true },
        { "computer", true },
        { "con", true },
        { "could", true },
        { "couldnt", true },
        { "cry", true },
        { "de", true },
        { "describe", true },
        { "detail", true },
        { "do", true },
        { "done", true },
        { "down", true },
        { "due", true },
        { "during", true },
        { "each", true },
        { "eg", true },
        { "eight", true },
        { "either", true },
        { "eleven", true },
        { "else", true },
        { "elsewhere", true },
        { "empty", true },
        { "enough", true },
        { "etc", true },
        { "even", true },
        { "ever", true },
        { "every", true },
        { "everyone", true },
        { "everything", true },
        { "everywhere", true },
        { "except", true },
        { "few", true },
        { "fifteen", true },
        { "fifty", true },
        { "fill", true },
        { "find", true },
        { "fire", true },
        { "first", true },
        { "five", true },
        { "for", true },
        { "former", true },
        { "formerly", true },
        { "forty", true },
        { "found", true },
        { "four", true },
        { "from", true },
        { "front", true },
        { "full", true },
        { "further", true },
        { "get", true },
        { "give", true },
        { "go", true },
        { "had", true },
        { "has", true },
        { "have", true },
        { "he", true },
        { "hence", true },
        { "her", true },
        { "here", true },
        { "hereafter", true },
        { "hereby", true },
        { "herein", true },
        { "hereupon", true },
        { "hers", true },
        { "herself", true },
        { "him", true },
        { "himself", true },
        { "his", true },
        { "how", true },
        { "however", true },
        { "hundred", true },
        { "i", true },
        { "ie", true },
        { "if", true },
        { "in", true },
        { "inc", true },
        { "indeed", true },
        { "interest", true },
        { "into", true },
        { "is", true },
        { "it", true },
        { "its", true },
        { "itself", true },
        { "keep", true },
        { "last", true },
        { "latter", true },
        { "latterly", true },
        { "least", true },
        { "less", true },
        { "ltd", true },
        { "made", true },
        { "many", true },
        { "may", true },
        { "me", true },
        { "meanwhile", true },
        { "might", true },
        { "mill", true },
        { "mine", true },
        { "more", true },
        { "moreover", true },
        { "most", true },
        { "mostly", true },
        { "move", true },
        { "much", true },
        { "must", true },
        { "my", true },
        { "myself", true },
        { "name", true },
        { "namely", true },
        { "neither", true },
        { "never", true },
        { "nevertheless", true },
        { "next", true },
        { "nine", true },
        { "no", true },
        { "nobody", true },
        { "none", true },
        { "nor", true },
        { "not", true },
        { "nothing", true },
        { "now", true },
        { "nowhere", true },
        { "of", true },
        { "off", true },
        { "often", true },
        { "on", true },
        { "once", true },
        { "one", true },
        { "only", true },
        { "onto", true },
        { "or", true },
        { "other", true },
        { "others", true },
        { "otherwise", true },
        { "our", true },
        { "ours", true },
        { "ourselves", true },
        { "out", true },
        { "over", true },
        { "own", true },
        { "part", true },
        { "per", true },
        { "perhaps", true },
        { "please", true },
        { "put", true },
        { "rather", true },
        { "re", true },
        { "same", true },
        { "see", true },
        { "seem", true },
        { "seemed", true },
        { "seeming", true },
        { "seems", true },
        { "serious", true },
        { "several", true },
        { "she", true },
        { "should", true },
        { "show", true },
        { "side", true },
        { "since", true },
        { "sincere", true },
        { "six", true },
        { "sixty", true },
        { "so", true },
        { "some", true },
        { "somehow", true },
        { "someone", true },
        { "something", true },
        { "sometime", true },
        { "sometimes", true },
        { "somewhere", true },
        { "still", true },
        { "such", true },
        { "system", true },
        { "take", true },
        { "ten", true },
        { "than", true },
        { "that", true },
        { "the", true },
        { "their", true },
        { "them", true },
        { "themselves", true },
        { "then", true },
        { "thence", true },
        { "there", true },
        { "thereafter", true },
        { "thereby", true },
        { "therefore", true },
        { "therein", true },
        { "thereupon", true },
        { "these", true },
        { "they", true },
        { "thick", true },
        { "thin", true },
        { "third", true },
        { "this", true },
        { "those", true },
        { "though", true },
        { "three", true },
        { "through", true },
        { "throughout", true },
        { "thru", true },
        { "thus", true },
        { "to", true },
        { "together", true },
        { "too", true },
        { "top", true },
        { "toward", true },
        { "towards", true },
        { "twelve", true },
        { "twenty", true },
        { "two", true },
        { "un", true },
        { "under", true },
        { "until", true },
        { "up", true },
        { "upon", true },
        { "us", true },
        { "very", true },
        { "via", true },
        { "was", true },
        { "we", true },
        { "well", true },
        { "were", true },
        { "what", true },
        { "whatever", true },
        { "when", true },
        { "whence", true },
        { "whenever", true },
        { "where", true },
        { "whereafter", true },
        { "whereas", true },
        { "whereby", true },
        { "wherein", true },
        { "whereupon", true },
        { "wherever", true },
        { "whether", true },
        { "which", true },
        { "while", true },
        { "whither", true },
        { "who", true },
        { "whoever", true },
        { "whole", true },
        { "whom", true },
        { "whose", true },
        { "why", true },
        { "will", true },
        { "with", true },
        { "within", true },
        { "without", true },
        { "would", true },
        { "yet", true },
        { "you", true },
        { "your", true },
        { "yours", true },
        { "yourself", true },
        { "yourselves", true }
    };

    /// <summary>
    /// Chars that separate words. Cached so the array is not rebuilt per call.
    /// </summary>
    private static readonly char[] _delimiters = new char[]
    {
        ' ',
        ',',
        ';',
        '.'
    };

    /// <summary>
    /// Remove stopwords and repeated words from a string.
    /// </summary>
    /// <param name="input">
    /// Text to filter. Words are separated by space, comma, semicolon or period.
    /// </param>
    /// <returns>
    /// The surviving words in their original order and casing, joined by single
    /// spaces. Only the first occurrence of each word (ignoring case) is kept.
    /// Returns an empty string when <paramref name="input"/> is null or empty.
    /// </returns>
    public static string RemoveStopwords(string input)
    {
        // Nothing to split: treat null like an empty string instead of
        // failing with a NullReferenceException on input.Split.
        if (string.IsNullOrEmpty(input))
        {
            return string.Empty;
        }

        // Split parameter into words, dropping the empty entries produced by
        // adjacent delimiters such as ", ".
        var words = input.Split(_delimiters,
            StringSplitOptions.RemoveEmptyEntries);

        // Words already emitted. Uses the same case-insensitive comparer as the
        // stopword table so "Cat" and "cat" count as one word.
        var found = new Dictionary<string, bool>(StringComparer.OrdinalIgnoreCase);

        // The result can never be longer than the input.
        StringBuilder builder = new StringBuilder(input.Length);

        foreach (string currentWord in words)
        {
            // Skip stopwords and words that were already kept once.
            if (_stops.ContainsKey(currentWord) || found.ContainsKey(currentWord))
            {
                continue;
            }

            if (builder.Length > 0)
            {
                builder.Append(' ');
            }
            builder.Append(currentWord);
            found.Add(currentWord, true);
        }

        // Trim so stray leading/trailing whitespace that is not a delimiter
        // (for example a tab attached to the first or last word) is removed.
        return builder.ToString().Trim();
    }
}

/// <summary>
/// Console demo: prints the filtered form of a few sample sentences.
/// </summary>
class Program
{
    /// <summary>
    /// Runs each sample through StopwordTool.RemoveStopwords and writes the
    /// result on its own line.
    /// </summary>
    static void Main()
    {
        string[] samples =
        {
            "I saw a cat and a horse",
            "Google searches the Internet",
            "Using an extra step"
        };

        foreach (string sample in samples)
        {
            string cleaned = StopwordTool.RemoveStopwords(sample);
            Console.WriteLine(cleaned);
        }
    }
}

Output

saw cat horse
Google searches Internet
Using extra step

The method loops through each word in your input string and checks whether the word is a stopword or has already been found. If the word is acceptable, it is appended to a StringBuilder, whose contents become the result.

StringBuilder

It declares a static Dictionary to store the stopwords. It adds the items using collection initializer syntax to the Dictionary. It is static because the stopword list is the same for every call and needs no per-instance state. Having different instances would simply waste memory.

DictionaryStatic

It removes duplicate words from the strings as well, which further aids in preparing search queries. The implicit var keyword in RemoveStopwords helps make the syntax cleaner and easier to read, at no performance or functional loss.

Var

And: The split method is used in RemoveStopwords with cached delimiters, which enhance performance. It separates the string into words.

Split

Summary. We saw a method that will remove specific words, such as stopwords, from a string. It uses Dictionary and static fields to accomplish this. It avoids degenerate performance with large data sets.

Warning: Looping through all stopwords for each word being checked, as a List-based search would, could become slow, particularly if your list of stopwords grew larger.


Related Links

Adjectives Ado Ai Android Angular Antonyms Apache Articles Asp Autocad Automata Aws Azure Basic Binary Bitcoin Blockchain C Cassandra Change Coa Computer Control Cpp Create Creating C-Sharp Cyber Daa Data Dbms Deletion Devops Difference Discrete Es6 Ethical Examples Features Firebase Flutter Fs Git Go Hbase History Hive Hiveql How Html Idioms Insertion Installing Ios Java Joomla Js Kafka Kali Laravel Logical Machine Matlab Matrix Mongodb Mysql One Opencv Oracle Ordering Os Pandas Php Pig Pl Postgresql Powershell Prepositions Program Python React Ruby Scala Selecting Selenium Sentence Seo Sharepoint Software Spellings Spotting Spring Sql Sqlite Sqoop Svn Swift Synonyms Talend Testng Types Uml Unity Vbnet Verbal Webdriver What Wpf