Your "store or process" is vague, but it implies to me that it would be acceptable to pre-process the file(s). In that case, the thought that came to me was to index the files so that the information can be efficiently retrieved.
Caveat: this is just thoughts, none of it has been tried!
So, assuming the index itself fits in memory, an architecture like the following might work...
Create a class to represent a sentence:
/// <summary>
/// Locates one sentence inside a source file: which file it lives in,
/// the stream offset where it starts, and how far it extends.
/// </summary>
public class Sentence
{
    /// <summary>Path of the file that contains this sentence.</summary>
    public string FilePath { get; set; }

    /// <summary>Stream offset of the first character of the sentence.</summary>
    public long BeginOffset { get; set; }

    /// <summary>Distance from <see cref="BeginOffset"/> to the end of the sentence.</summary>
    public int Length { get; set; }
}
Then the class for all of this word lookup solution:
/// <summary>
/// Builds an in-memory index mapping each word to the sentences that contain it,
/// so any sentence can later be re-read from its file on demand.
/// NOTE(review): BaseStream.Position reflects how far the StreamReader has *buffered*,
/// not the position of the next character handed out by Read(), so the offsets
/// recorded here are approximate; see the TrackedStreamReader variant below for a fix.
/// </summary>
public class WordLookup
{
    // word -> sentences containing that word (backing store for the index).
    Dictionary<string, List<Sentence>> LookupMap = new Dictionary<string, List<Sentence>>();

    /// <summary>
    /// Scans the file and records, for every word, the sentence (file path +
    /// begin offset + length) in which it occurs.
    /// Returns false when the underlying stream cannot seek, since the recorded
    /// offsets would then be useless for re-reading sentences later.
    /// </summary>
    public bool AddFileToIndex(string filePath)
    {
        using (StreamReader reader = File.OpenText(filePath))
        {
            Stream bs = reader.BaseStream;
            if (!bs.CanSeek)
                return false;
            StringBuilder word = new StringBuilder();
            int charcode;
            Sentence sent = new Sentence()
            {
                FilePath = filePath,
                BeginOffset = bs.Position
            };
            bool inSentence = false;
            bool inWord = false;
            // BUGFIX(consistency): de-duplicate words per sentence, as the revised
            // version below does, so a repeated word doesn't add the sentence twice.
            HashSet<string> wordsThisSentence = new HashSet<string>();
            while (0 <= (charcode = reader.Read()))
            {
                char c = (char)charcode;
                if (char.IsWhiteSpace(c))
                {
                    // Whitespace before a sentence starts: slide the start offset forward.
                    if (!inSentence)
                        sent.BeginOffset = bs.Position;
                    if (!inWord)
                        continue;
                }
                // BUGFIX: the original had an empty "if ()" here; anything that is
                // not a letter or digit terminates the current word.
                if (!char.IsLetterOrDigit(c))
                {
                    if (inWord)
                    {
                        string w = word.ToString();
                        if (wordsThisSentence.Add(w))
                            InternalSentencesForWord(w, true).Add(sent);
                        word.Clear();
                        inWord = false;
                    }
                    // BUGFIX: second empty "if ()"; '.', '?' and '!' end a sentence.
                    if (c == '.' || c == '?' || c == '!')
                    {
                        long pos = bs.Position;
                        // BUGFIX: cast required — Length is int, the offsets are long.
                        sent.Length = (int)(pos - sent.BeginOffset);
                        sent = new Sentence()
                        {
                            FilePath = filePath,
                            BeginOffset = pos
                        };
                        wordsThisSentence.Clear();
                        inSentence = false;
                    }
                }
                else
                {
                    inWord = inSentence = true;
                    word.Append(c);
                }
            }
            // Flush a trailing word / unterminated final sentence at end of file.
            if (inWord)
            {
                string w = word.ToString();
                if (wordsThisSentence.Add(w))
                    InternalSentencesForWord(w, true).Add(sent);
            }
            if (inSentence)
            {
                long pos = bs.Position;
                sent.Length = (int)(pos - sent.BeginOffset);
            }
        }
        return true;
    }

    /// <summary>
    /// Read-only view of the sentences indexed for word <paramref name="w"/>;
    /// an empty collection when the word is unknown.
    /// </summary>
    public ReadOnlyCollection<Sentence> SentencesForWord(string w)
    {
        return new ReadOnlyCollection<Sentence>(InternalSentencesForWord(w, false));
    }

    // Returns the mutable sentence list for w. When addToMap is true, a missing
    // entry is created and registered so the indexer can append to it; when false,
    // a missing word yields a throwaway empty list (nothing is polluted).
    private List<Sentence> InternalSentencesForWord(string w, bool addToMap)
    {
        List<Sentence> sentences;
        if (!LookupMap.TryGetValue(w, out sentences))
        {
            sentences = new List<Sentence>();
            if (addToMap)
                LookupMap.Add(w, sentences);
        }
        return sentences;
    }
}
Use the
SentencesForWord(w)
method to get the collection of
Sentence
instances that will let you open the appropriate file, seek to the beginning of the sentence, read in only that sentence and then display it.
Obviously, you don't want to do this indexing each time you do a lookup, only when the file(s) change. Persisting this efficiently is left to you.
In fact, it would probably be even better to keep this information structure in a database, either local or on a server. That depends on your application, and how often your input file(s) change. Also, I didn't explore how to efficiently remove things from this index without rebuilding the whole thing from scratch.
Good luck.
=====
Edit MTH:
The indexing is actually much harder than I thought, because
StreamReader
buffers its input: for disk files, BaseStream.Position reports how far the reader has filled its internal buffer, not the actual byte position that corresponds with the next character to be read.
It's not
incredibly difficult if you can
guarantee only 1 byte characters. But this still depends on reflection and knowing the details of the actual implementation of
StreamReader
!!!!
First add:
/// <summary>
/// A StreamReader that exposes the byte position of the NEXT character Read()
/// will return, by compensating for the reader's internal buffering.
/// Relies on reflection into StreamReader's private buffer fields, so it is tied
/// to the runtime's implementation; only correct for single-byte encodings.
/// </summary>
class TrackedStreamReader : StreamReader
{
    public TrackedStreamReader(string path)
        : base(path)
    {
        Type bst = typeof(StreamReader);
        const BindingFlags flags = BindingFlags.DeclaredOnly | BindingFlags.Instance |
                                   BindingFlags.Public | BindingFlags.NonPublic;
        // BUGFIX: .NET Framework names the fields "charPos"/"charLen", while
        // .NET (Core) names them "_charPos"/"_charLen". Probe both so the class
        // works on either runtime instead of silently storing null FieldInfos.
        CharPosField = bst.GetField("charPos", flags) ?? bst.GetField("_charPos", flags);
        CharLenField = bst.GetField("charLen", flags) ?? bst.GetField("_charLen", flags);
    }

    // NOTE: the original kept a redundant "(StreamReader)this" in a field;
    // "this" already is the StreamReader, so the field was removed.
    private readonly FieldInfo CharPosField;
    private readonly FieldInfo CharLenField;

    /// <summary>
    /// Byte position corresponding to the next character to be read.
    /// BaseStream.Position is ahead by the filled-but-unconsumed part of the
    /// internal buffer (charLen - charPos); subtracting it yields the logical
    /// position. Falls back to the raw BaseStream.Position when the private
    /// fields cannot be located (unknown runtime layout).
    /// </summary>
    public long Position
    {
        get
        {
            if (CharPosField == null || CharLenField == null)
                return BaseStream.Position; // best effort on unknown runtimes
            int charPos = (int)CharPosField.GetValue(this);
            int charLen = (int)CharLenField.GetValue(this);
            return BaseStream.Position - charLen + charPos;
        }
    }
}
Then change the
AddFileToIndex
to use that:
/// <summary>
/// Indexes filePath using TrackedStreamReader.Position, so the recorded offsets
/// match the next character actually read rather than the stream's buffered
/// position. Returns false for non-seekable or empty files.
/// </summary>
public bool AddFileToIndex(string filePath)
{
    // BUGFIX: the using statement was missing the variable name ("reader"),
    // which the body relies on — it did not compile as posted.
    using (TrackedStreamReader reader = new TrackedStreamReader(filePath))
    {
        if (!reader.BaseStream.CanSeek)
            return false;
        if (reader.EndOfStream)
            return false;
        StringBuilder word = new StringBuilder();
        Sentence sent = new Sentence()
        {
            FilePath = filePath,
            BeginOffset = reader.Position
        };
        bool inSentence = false;
        bool inWord = false;
        // Guards against adding the same sentence repeatedly for a repeated word.
        HashSet<string> wordsThisSentence = new HashSet<string>();
        while (!reader.EndOfStream)
        {
            char c = (char)reader.Read();
            if (char.IsWhiteSpace(c))
            {
                // Whitespace before a sentence starts: slide the start offset forward.
                if (!inSentence)
                    sent.BeginOffset = reader.Position;
                if (!inWord)
                    continue;
            }
            if (EndsWord(c))
            {
                if (inWord)
                {
                    string w = word.ToString();
                    // HashSet.Add returns false when w was already recorded
                    // for this sentence, collapsing the Contains+Add pair.
                    if (wordsThisSentence.Add(w))
                        InternalSentencesForWord(w, true).Add(sent);
                    word.Clear();
                    inWord = false;
                }
                if (EndsSentence(c))
                {
                    long pos = reader.Position;
                    sent.Length = (int)(pos - sent.BeginOffset);
                    sent = new Sentence()
                    {
                        FilePath = filePath,
                        BeginOffset = pos
                    };
                    wordsThisSentence.Clear();
                    inSentence = false;
                }
            }
            else
            {
                inWord = inSentence = true;
                word.Append(c);
            }
        }
        // Flush a trailing word / unterminated final sentence at end of file.
        // BUGFIX(consistency): the trailing flush must respect the per-sentence
        // de-duplication too, exactly like the in-loop flush above.
        if (inWord)
        {
            string w = word.ToString();
            if (wordsThisSentence.Add(w))
                InternalSentencesForWord(w, true).Add(sent);
        }
        if (inSentence)
        {
            long pos = reader.Position;
            sent.Length = (int)(pos - sent.BeginOffset);
        }
    }
    return true;
}
// A character continues a word only while it is a letter or a digit; any other
// character (whitespace, punctuation, symbols) terminates the current word.
private bool EndsWord(char c)
{
    bool continuesWord = char.IsLetterOrDigit(c);
    return !continuesWord;
}
// Sentence terminators: period, question mark, exclamation point.
private bool EndsSentence(char c)
{
    switch (c)
    {
        case '.':
        case '?':
        case '!':
            return true;
        default:
            return false;
    }
}
I'm still thinking through dealing with multiple byte characters!
(There's probably eventually an article in all of this...)
=====