Click here to Skip to main content
15,501,490 members
Please Sign up or sign in to vote.
1.00/5 (2 votes)
See more:
C++
/* Program for Bad Character Heuristic of Boyer Moore String Matching Algorithm */ 
# include <limits.h>
# include <string.h>
# include <vector>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <omp.h>
 
# define NO_OF_CHARS 256

using namespace std;

// An unsigned char can store 1 Bytes (8bits) of data (0-255)
typedef unsigned char BYTE;


// A utility function to get maximum of two integers
int max (int a, int b) { return (a > b)? a: b; }



// The preprocessing function for Boyer Moore's bad character heuristic
void badCharHeuristic( char *str, int size, int badchar[NO_OF_CHARS])
{
    int i;
 
    // Initialize all occurrences as -1
    for (i = 0; i < NO_OF_CHARS; i++)
         badchar[i] = -1;
 
    // Fill the actual value of last occurrence of a character
    for (i = 0; i < size; i++)
         badchar[(int) str[i]] = i;
}
 
/* A pattern searching function that uses Bad Character Heuristic of
   Boyer Moore Algorithm */
void search( char *txt,  string *Keywords,long long Pos,int &KeyWordCount,vector< vector<long long> > &Offsets)
{   	
	char *pat;
	
	for (int i=0;i<KeyWordCount;i++)
	{	
	    pat=(char*)Keywords[i].c_str();
	    int m = strlen(pat);
	    int n = strlen(txt);
	    //cout<<"\n"<<pat<<"\n";		
	    
	        
	 
	    int badchar[NO_OF_CHARS];
	 
	    /* Fill the bad character array by calling the preprocessing
	       function badCharHeuristic() for given pattern */
	    badCharHeuristic(pat, m, badchar);
	 
	    int s = 0;  // s is shift of the pattern with respect to text
	    while(s <= (n - m))
	    {
		int j = m-1;
	 
		/* Keep reducing index j of pattern while characters of
		   pattern and text are matching at this shift s */
		while(j >= 0 && pat[j] == txt[s+j])
		    j--;
	 
		/* If the pattern is present at current shift, then index j
		   will become -1 after the above loop */
		if (j < 0)
		{    		
			//printf("\nFound @ Offset %d", Pos+s);
						
			if(!Offsets[i].empty()) 			//if Any Offset Exist For Keyword
				if(Offsets[i].back()==(Pos+s)) 	//Check it with last offset for duplication			
					Offsets[i].pop_back();		//If duplicates, pop out		
			
			Offsets[i].push_back((long long)(Pos+s)); //Push the offset
			//cout<<"\n '"<<pat<<"' Found @ Offset:"<<(Pos+s);	
		    	
	 
		    /* Shift the pattern so that the next character in text
		       aligns with the last occurrence of it in pattern.
		       The condition s+m < n is necessary for the case when
		       pattern occurs at the end of text */
		    s += (s+m < n)? m-badchar[txt[s+m]] : 1;
	 
		}
	 
		else
		    /* Shift the pattern so that the bad character in text
		       aligns with the last occurrence of it in pattern. The
		       max function is used to make sure that we get a positive
		       shift. We may get a negative shift if the last occurrence
		       of bad character in pattern is on the right side of the
		       current character. */
		    s += max(1, j - badchar[txt[s+j]]);
	    }
	}
    
   
}


long long FileSize(const char* sFileName)
{
  std::ifstream f;
  f.open(sFileName, std::ios_base::binary | std::ios_base::in);
  if (!f.good() || f.eof() || !f.is_open()) { return 0; }
  f.seekg(0, std::ios_base::beg);
  std::ifstream::pos_type begin_pos = f.tellg();
  f.seekg(0, std::ios_base::end);
  return static_cast<long long>(f.tellg() - begin_pos);
}

/*	The Program read the File and Search For Multiple Keyword.
	During Searching the occurrence of each Keyword is Stored in a Seperate Vector
	The Program Currently Displays the Hit Count of each Keyword using its Vector Size
*/
int main()
{	
	
	/*---------------------------------*/
	/* Arguments to the Function, these values need to be updated */
	
	//Give the File Location 
	const char *filePath = "xyz.raw";	// 
	int KeyWordCount=2;
	//Define the Searching Keywords;
	string *Keywords=new string[KeyWordCount];
	Keywords[2]="gmail";
	Keywords[0]="facebook";
	int MaxKeyLen=0;
	
	
	
	//Define a Vector For Storing Offsets of Keyword occurrence  
	vector< vector<long long> > Offsets;
	for(int i=0;i<KeyWordCount;i++)
	{	Offsets.push_back(vector<long long>()); //Initialise Vector
		if(Keywords[i].size()> MaxKeyLen)
		   MaxKeyLen=Keywords[i].size();
	}
	
	cout<<"Max Length Of Keyword: "<<MaxKeyLen<<"\n";
	/* --------------------------------------------*/

	
	FILE *file = NULL;			// File pointer
	int ChunkSize=512; 		// Declare the Blocks Size ie read from File
					// Current File Pointer

	// Open the file in binary mode using the "rb" format string
	// This also checks if the file exists and/or can be opened for reading correctly
	if ((file = fopen(filePath, "rb")) == NULL)
		cout << "Could not open specified file" << endl;
	else
		cout << "File opened successfully" << endl;

	//off_t fileSize = getfilesize(file);
	long long fileSize = FileSize(filePath);
	cout<<"FileSize:"<<fileSize<<endl;
	
	

	cout<<"Searching :\n";
        //Process the File as Blocks
	int Currlevel;
	int w;

	#pragma omp parallel for
	for(int Pos=0;Pos<=fileSize; Pos+=ChunkSize-MaxKeyLen)
	{	
				
									//Advance the pointer , here the MaxKeyLen is Subtracted
				//if(Pos!=0)					//to skip keyword Skipping duringreading
				
				int x; // a variable local or private to each thread
				char *fileBuf;
				// Allocate space in the buffer for the Specified Block Size
				try
				{
					fileBuf = new char[ChunkSize];
				}
				catch(...)
				{
					cout<<"Memory allocation exception"<<endl;
					//return 0;
				}
			    	x = omp_get_thread_num();
				cout<<"Thread="<<x<<"Pos:"<<Pos<<"\n";
				fread(fileBuf, ChunkSize, 1, file); 		//Read the File into Buffer
				fileBuf[ChunkSize]='\0';			//put a String Terminator For Search
				
				search(fileBuf,Keywords,Pos,KeyWordCount,Offsets);	//Record the Hit
				delete[]fileBuf;					//delete the Buffer
				w++;
				fseek(file,Pos,SEEK_SET);			//Seek the file pointer
				//std::cin.get();
		     		
		
	}
	//#pragma omp barrier

	//cout<<"\r"<<flush<<"Completed"; //Refresh the Screen
	
	/* Print the Search Result */
	cout<<"\n\t***Keyword Result*** \n \tKeyword :\t Count";
	for(int i=0;i<KeyWordCount;i++)
		cout<<"\n\t"<<Keywords[i]<<" :\t "<<Offsets[i].size()<<""; //Print Each Keyword and Its Seach Hit Count using Size of its Vector
	
	cout<<"\n i="<<w;
	/* --------------------------------------------*/

	
	delete []Keywords;	//delete the Buffer
	Offsets.clear();
        fclose(file);		//Close the file
	return 0;
}
Posted
Updated 24-Mar-14 9:04am
v2
Comments
OriginalGriff 24-Mar-14 12:42pm     CRLF
Help you with what? All you have is a wish "I want to search multiple strings or string, in very large file(Gb size raw file) using Boyer moore Algm" and a code dump. We need to know what part is giving you problems, what those problems are, and whay help you need. Use the "Improve question" widget to edit your question and provide better information.
Rahul Mohanan 24-Mar-14 14:14pm    
I'm sorry, my English not good..! actually i want to search for keywords(eg. gmail,facebook) in large .raw or .DMP file
OriginalGriff 24-Mar-14 15:00pm     CRLF
I can understand that - but we can't run your application and get the same results because we don't have access to your data! So, pretty much you have to do the work: start with the debugger and find out what does work, how far it gets, where it fails. All of this starts to tell you where the problem is, and then you can work out what causes it. We can't do that! :laugh:
Mattias Högström 24-Mar-14 17:40pm     CRLF
int KeyWordCount=2; //Define the Searching Keywords; string *Keywords=new string[KeyWordCount]; Keywords[2]="gmail"; Keywords[0]="facebook"; The indexes look bad. The other people are right, without the datafile it is hard to know where it crashes. Crashes are really easy to find in a debugger. If you have memory overruns or underruns, try running Valgrind on the application.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900