.NET
C# text file deduping based on trimmed lines
A while ago, I needed to analyze a bunch of files based on the unique trimmed lines in them.
I based my code on the C# Tee filter and the StackOverflow example of C# deduping based on split.
It is a bit more extensive than strictly needed, as it has a few extra command-line arguments that come in handy when processing files on the console:
DeDupe - Dedupes a file into unique lines (only the first occurance of a line is kept) standard output
Lines are terminated by CRLF sequences
C# implementation januari 5th, 2012 by Jeroen Wiert Pluimers (http://wiert.wordpress.com),
DeDupe [-i | --ignore] [-t | --trim] [-f | --flush] [-l | --literal] [-? | --h | --help | /?] [file0] [...]
Example:
DeDupe --trim file0.txt file1.txt
Dedupes the appended content of file0.txt and file1.txt into standard output
-t | --trim Will trim the lines before considering duplicates
-f | --flush Flushes files every CRLF
(setting is per tee instance)
-i | --ignore Ignore cancel Ctrl+C keypress: see UnixUtils tee
-l | --literal Stop recognizing flags, force all following filenames literally
-? | --h | /? | --help Displays this message and immediately quits
Duplicate filenames are quietly ignored.
If no input filenames are specified, then standard input is used
Press Ctrl+Z (End of File character) then Enter to abort.
Here is the source code:
using System;
using System.IO;
using System.Collections.Generic;
namespace DeDupe
{
    /// <summary>
    /// Console filter that writes each distinct input line to standard output once,
    /// keeping only the first occurrence. Input comes from the files named on the
    /// command line (processed in order, sharing one set of seen lines) or from
    /// standard input when no file is named.
    /// </summary>
    class Program
    {
        /// <summary>Exit code returned when all input was processed.</summary>
        private const int ExitSuccess = 0;

        /// <summary>Exit code returned after the help text was displayed.</summary>
        private const int ExitHelp = 1;

        /// <summary>Exit code returned when processing stopped because of an exception.</summary>
        private const int ExitError = 2;

        /// <summary>
        /// Writes the usage text to standard error.
        /// </summary>
        static void help()
        {
            Console.Error.WriteLine("DeDupe - Dedupes a file into unique lines (only the first occurance of a line is kept) standard output");
            Console.Error.WriteLine("Lines are terminated by CRLF sequences");
            Console.Error.WriteLine("C# implementation januari 5th, 2012 by Jeroen Wiert Pluimers (http://wiert.wordpress.com),");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("DeDupe [-i | --ignore] [-t | --trim] [-f | --flush] [-l | --literal] [-? | --h | --help | /?] [file0] [...]");
            Console.Error.WriteLine(" Example:");
            Console.Error.WriteLine(" DeDupe --trim file0.txt file1.txt");
            Console.Error.WriteLine(" Dedupes the appended content of file0.txt and file1.txt into standard output");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("-t | --trim Will trim the lines before considering duplicates");
            Console.Error.WriteLine("-f | --flush Flushes files every CRLF");
            Console.Error.WriteLine(" (setting is per tee instance)");
            Console.Error.WriteLine("-i | --ignore Ignore cancel Ctrl+C keypress: see UnixUtils tee");
            Console.Error.WriteLine("-l | --literal Stop recognizing flags, force all following filenames literally");
            Console.Error.WriteLine("-? | --h | /? | --help Displays this message and immediately quits");
            Console.Error.WriteLine("");
            Console.Error.WriteLine("Duplicate filenames are quietly ignored.");
            Console.Error.WriteLine("If no input filenames are specified, then standard input is used");
            Console.Error.WriteLine("Press Ctrl+Z (End of File character) then Enter to abort.");
        }

        /// <summary>
        /// Ctrl+C handler installed when -i / --ignore is given.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="args">Event data; its Cancel property is set to true.</param>
        static void OnCancelKeyPressed(Object sender, ConsoleCancelEventArgs args)
        {
            // Setting Cancel to true prevents the process from terminating.
            args.Cancel = true;
        }

        /// <summary>Input filenames in the order first seen on the command line.</summary>
        static List<String> filenames = new List<String>();

        /// <summary>
        /// Adds a filename to the input list unless that exact string is already present.
        /// </summary>
        /// <param name="value">Filename as given on the command line.</param>
        static void addFilename(string value)
        {
            if (!filenames.Contains(value))
            {
                filenames.Add(value);
            }
        }

        /// <summary>Set by -t / --trim: lines are trimmed as they are read.</summary>
        static bool trimLines = false;

        /// <summary>Set by -f / --flush: the output is flushed after every written line.</summary>
        static bool flushFiles = false;

        /// <summary>Set by -l / --literal: all remaining arguments are treated as filenames.</summary>
        static bool stopInterpretingFlags = false;

        /// <summary>Set by -i / --ignore: Ctrl+C is swallowed instead of ending the process.</summary>
        static bool ignoreCtrlC = false;

        /// <summary>
        /// Parses the command line, then dedupes the named files (or standard input)
        /// into standard output.
        /// </summary>
        /// <param name="args">Command-line arguments: flags and input filenames.</param>
        /// <returns>
        /// 0 when all input was processed, 1 when the help text was shown,
        /// 2 when an exception interrupted processing (its message is written to standard error).
        /// </returns>
        static int Main(string[] args)
        {
            try
            {
                foreach (string arg in args)
                {
                    if (stopInterpretingFlags)
                    {
                        // After -l / --literal everything is a filename, even if it looks like a flag.
                        addFilename(arg);
                    }
                    else if (arg.Equals("/?") || arg.Equals("-?") || arg.Equals("-h") || arg.Equals("--h") || arg.Equals("--help"))
                    {
                        // "--h" is the spelling the usage text advertises; "-h" is kept
                        // because earlier versions accepted it.
                        help();
                        return ExitHelp; // Quit immediately
                    }
                    else if (arg.Equals("-t") || arg.Equals("--trim"))
                    {
                        trimLines = true;
                    }
                    else if (arg.Equals("-f") || arg.Equals("--flush"))
                    {
                        flushFiles = true;
                    }
                    else if (arg.Equals("-i") || arg.Equals("--ignore"))
                    {
                        ignoreCtrlC = true;
                    }
                    else if (arg.Equals("-l") || arg.Equals("--literal"))
                    {
                        stopInterpretingFlags = true;
                    }
                    else
                    {
                        // Not a recognized flag, so it is a filename.
                        addFilename(arg);
                    }
                    // Add more flags as necessary, just remember to SKIP adding them to the filename list!
                }

                if (ignoreCtrlC)
                {
                    // Only hook Ctrl+C when asked to (mirrors UnixUtils tee behavior).
                    Console.CancelKeyPress += OnCancelKeyPressed;
                }

                // One shared set for all inputs, so a line seen in an earlier file
                // is also suppressed in later files.
                HashSet<string> keys = new HashSet<string>();
                Int64 index = 0;
                using (StreamWriter writer = new StreamWriter(Console.OpenStandardOutput()))
                {
                    if (filenames.Count == 0)
                    {
                        using (StreamReader reader = new StreamReader(Console.OpenStandardInput()))
                        {
                            processInputFileReader(keys, writer, reader, ref index);
                        }
                    }
                    else
                    {
                        foreach (String filename in filenames)
                        {
                            using (StreamReader reader = new StreamReader(filename))
                            {
                                processInputFileReader(keys, writer, reader, ref index);
                            }
                        }
                    }
                    writer.Flush();
                }
                return ExitSuccess;
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine(String.Concat("DeDupe: ", ex.Message)); // Send error messages to stderr
                // Report the failure to the caller instead of pretending success.
                return ExitError;
            }
        }

        /// <summary>
        /// Copies every line of <paramref name="reader"/> that is not yet in
        /// <paramref name="keys"/> to <paramref name="writer"/>, recording it in the set.
        /// Blank lines are ordinary lines: the first one is written, later ones are duplicates.
        /// </summary>
        /// <param name="keys">Lines already written; updated with each newly written line.</param>
        /// <param name="writer">Destination for the unique lines.</param>
        /// <param name="reader">Source of the lines; read until end of stream.</param>
        /// <param name="index">
        /// Running total of characters written, counting each line's length plus the
        /// length of Environment.NewLine.
        /// </param>
        private static void processInputFileReader(HashSet<string> keys, StreamWriter writer, StreamReader reader, ref Int64 index)
        {
            string line;
            // Only null marks end of stream. An empty string is a legitimate blank
            // line and must not end the loop, or everything after it would be dropped.
            while ((line = readLine(reader)) != null)
            {
                // HashSet.Add returns false when the line was already seen.
                if (keys.Add(line))
                {
                    writer.WriteLine(line);
                    index += line.Length + Environment.NewLine.Length;
                    if (flushFiles)
                    {
                        writer.Flush();
                    }
                }
            }
        }

        /// <summary>
        /// Reads the next line, trimming it when the trim option is active, so the
        /// trimmed text is both the duplicate key and the text written out.
        /// </summary>
        /// <param name="reader">Source to read from.</param>
        /// <returns>The (possibly trimmed) line, or null at end of stream.</returns>
        private static string readLine(StreamReader reader)
        {
            string line = reader.ReadLine();
            if (line != null && trimLines)
            {
                line = line.Trim();
            }
            return line;
        }
    }
}
Reference: C# text file deduping based on trimmed lines (via: Stack Overflow) from our NCG partner Jeroen Pluimers at The Wiert Corner blog.
Do you want to know how to develop your skillset to become a sysadmin Rockstar?
Subscribe to our newsletter to start Rocking right now!
To get you started we give you our best selling eBooks for FREE!
1. Introduction to NGINX
2. Apache HTTP Server Cookbook
3. VirtualBox Essentials
4. Nagios Monitoring Cookbook
5. Linux BASH Programming Cookbook
6. Postgresql Database Tutorial
and many more ....
I agree to the Terms and Privacy Policy




