You are on page 1 of 11

BOOLEAN RETRIEVAL SYSTEM

Information Retrieval Assignment


avg598@gmail.com
Abhishek V. Ghogare
BT10CSE024


Introduction

This boolean retrieval system is implemented for local file search.
C# programming language is used for implementation.
An open source library iTextSharp is used for reading PDF file contents.








Usage:
Specify a folder as a search domain for BRS from BRS Tools > Set Search Domain.
It will generate cache for that folder or skip if already generated.














Now type your query in the search box, using only !, & and | as operators between tokens.
Click Search to list, in the result box, the files that satisfy the query.
Select the file you want to open and click Open.





Implementation Details:
Programming Language : C#
External Libraries : iTextSharp
Data Structure for list of tokens : Dynamic Array


o When the user specifies the search domain folder, the program makes a list of all PDF files in that
directory and all child directories.
o Text is extracted from each file, page by page, using iTextSharp, and this text is converted to a token
list.
o All generated tokens are inserted into the universal token list.
o If a token is already present in the list, the bit for the corresponding file is set to 1 in that token's bits; otherwise a new
token is inserted in the list along with the corresponding bit.
o The file corresponding to each token in the token list is updated with 8 bits (one byte) after every 8 files are processed.


o To execute a query, the query is parsed and every token in the query is attributed with two Boolean
variables: isComplement and isAnd, i.e. whether the token is associated with ! and with &.
o The result byte is initialized with 0x00.
o The result is then constructed as follows:
I. The result byte is initialized with the first byte in the file corresponding to the first token in the query.
II. Select the next token in the query and fetch a byte from the file corresponding to that token.
III. Perform a bitwise NOT operation on that byte if the value of isComplement is true.
IV. Perform a bitwise AND operation with the result byte if isAnd is true; otherwise perform a bitwise
OR operation.
V. Repeat from step II for all tokens in the query.
VI. Insert into the result list all files corresponding to a 1 bit in the result byte.
VII. Repeat from step I for all bytes in the file corresponding to the first token (all token files
contain the same number of bytes, because the number of searchable files in the corpus is
the same for every token).

The BRS class
// Interface summary of the BRS (Boolean Retrieval System) class.
// GenSysCache and Search are shown here without bodies; their bodies are
// given in the "BRS class implementation" listing further down.
class BRS
{
// Folder whose PDF files are searched; empty until SetSearchDomain is called.
String _path_to_search_domain;
// Number of files covered by the cache; 0 until GenSysCache has run.
int _number_of_files;
// Creates an instance with no search domain and a file count of zero.
public BRS()
{
_path_to_search_domain = "";
_number_of_files = 0;
}
// Stores the folder to search. The path is not validated here.
public void SetSearchDomain(String path)
{
_path_to_search_domain = path;
}
// Builds the on-disk cache for the search domain, or reuses an existing one.
public void GenSysCache();
// Evaluates a query written with the !, & and | operators and returns the
// paths of the matching files.
public String[] Search(String query);
};

BRS class implementation

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

using System.IO;
using System.Collections;
using System.Windows.Forms;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.Text.RegularExpressions;

namespace Boolean_Retrieval_System
{
/// <summary>
/// Boolean retrieval system over the PDF files found below a search-domain folder.
/// </summary>
/// <remarks>
/// The cache lives in "&lt;domain&gt;\__BRS":
/// <list type="bullet">
/// <item>"filelist.brs" holds one indexed file path per line; the line number
/// (0-based) is the file's index.</item>
/// <item>"tokens\&lt;token&gt;.tkn" holds one bit per indexed file, most significant
/// bit first: file <c>n</c> is bit <c>0x80 &gt;&gt; (n % 8)</c> of byte <c>n / 8</c>.</item>
/// </list>
/// Caches written by earlier versions of this class can contain misaligned
/// token files; delete the "__BRS" folder to force a rebuild.
/// </remarks>
class BRS
{
    private const String CacheDirName = "__BRS";
    private const String TokenDirName = "tokens";
    private const String FileListName = "filelist.brs";
    private const String TokenFileExtension = ".tkn";

    // A token is a non-empty run of ASCII letters, digits and underscores.
    private static readonly Regex TokenPattern = new Regex("^[a-zA-Z0-9_]+$");

    // Characters that separate words in extracted PDF text.
    private static readonly char[] Delimiters =
    {
        ' ', ',', '.', ':', '\t', '\n', '\r', '\\', '/', '\'',
        '"', '(', ')', '-', '?'
    };

    // Windows device names; they cannot be used as token file names.
    private static readonly HashSet<String> ReservedNames = new HashSet<String>
    {
        "con", "aux", "nul", "prn",
        "com1", "com2", "com3", "com4", "com5", "com6", "com7", "com8", "com9",
        "lpt1", "lpt2", "lpt3", "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9"
    };

    String _path_to_search_domain;
    int _number_of_files;

    /// <summary>Creates an instance with no search domain and no cached files.</summary>
    public BRS()
    {
        _path_to_search_domain = "";
        _number_of_files = 0;
    }

    /// <summary>Sets the folder whose PDF files are to be searched.</summary>
    /// <param name="path">Folder path. It is validated by <see cref="GenSysCache"/>, not here.</param>
    public void SetSearchDomain(String path)
    {
        _path_to_search_domain = path;
    }

    /// <summary>
    /// Builds the token cache for the search domain, or reuses an existing one.
    /// </summary>
    /// <remarks>
    /// Shows an error message box and returns when the search domain is unset
    /// or does not exist. When both the token folder and the file list are
    /// already present, only the file count is reloaded. Exceptions raised
    /// while reading a PDF or writing the cache propagate to the caller; the
    /// file list is written last, so an interrupted build is redone on the
    /// next call.
    /// </remarks>
    public void GenSysCache()
    {
        if (String.IsNullOrEmpty(_path_to_search_domain) || !Directory.Exists(_path_to_search_domain))
        {
            MessageBox.Show("Search domain not specified or does not exist!",
                "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
            return;
        }

        String tokenDir = TokenDirectory();
        String fileListPath = FileListPath();

        if (Directory.Exists(tokenDir) && File.Exists(fileListPath))
        {
            // Cache already generated: only the file count is needed.
            _number_of_files = File.ReadLines(fileListPath).Count();
            return;
        }

        _number_of_files = 0;

        if (Directory.Exists(tokenDir))
        {
            // Token folder without a file list: leftovers of an interrupted
            // build. Remove them so stale bitmaps cannot be picked up.
            foreach (String stale in Directory.GetFiles(tokenDir, "*" + TokenFileExtension))
            {
                File.Delete(stale);
            }
        }
        else
        {
            // Also creates the parent cache folder when needed.
            Directory.CreateDirectory(tokenDir);
        }

        String[] files = Directory.GetFiles(_path_to_search_domain, "*.pdf",
            SearchOption.AllDirectories);

        // The number of files is known up front, so every token gets a
        // full-size bitmap in memory. This keeps all token files the same
        // length and avoids holding one open file handle per token.
        int bytesPerToken = (files.Length + 7) / 8;
        Dictionary<String, byte[]> bitmaps = new Dictionary<String, byte[]>();

        for (int fileIndex = 0; fileIndex < files.Length; fileIndex++)
        {
            int byteIndex = fileIndex / 8;
            byte fileBit = (byte)(0x80 >> (fileIndex % 8));

            PdfReader pdfreader = new PdfReader(files[fileIndex]);
            try
            {
                for (int page = 1; page <= pdfreader.NumberOfPages; page++)
                {
                    String text = PdfTextExtractor.GetTextFromPage(pdfreader, page);
                    foreach (String token in Tokenize(text))
                    {
                        byte[] bitmap;
                        if (!bitmaps.TryGetValue(token, out bitmap))
                        {
                            bitmap = new byte[bytesPerToken];
                            bitmaps.Add(token, bitmap);
                        }
                        bitmap[byteIndex] |= fileBit;
                    }
                }
            }
            finally
            {
                pdfreader.Close();
            }
        }

        foreach (KeyValuePair<String, byte[]> entry in bitmaps)
        {
            File.WriteAllBytes(TokenFilePath(entry.Key), entry.Value);
        }

        // Written last: its presence marks the cache as complete.
        File.WriteAllLines(fileListPath, files);
        _number_of_files = files.Length;
    }

    /// <summary>
    /// Evaluates a Boolean query against the cache and returns the matching files.
    /// </summary>
    /// <param name="query">
    /// Tokens combined with <c>&amp;</c> (AND), <c>|</c> (OR) and <c>!</c> (NOT).
    /// Operators may be written with or without surrounding spaces. The query
    /// is evaluated strictly left to right; tokens with no operator between
    /// them are ORed. Characters other than letters, digits, underscore and
    /// the three operators act as separators.
    /// </param>
    /// <returns>
    /// Paths of the matching files in file-list order. Empty when the query
    /// has no tokens or when no cache has been loaded by <see cref="GenSysCache"/>.
    /// </returns>
    public String[] Search(String query)
    {
        List<String> result = new List<String>();
        if (String.IsNullOrEmpty(query) || _number_of_files <= 0)
        {
            return result.ToArray();
        }

        List<QueryTerm> terms = ParseQuery(query);
        String fileListPath = FileListPath();
        if (terms.Count == 0 || !File.Exists(fileListPath))
        {
            return result.ToArray();
        }

        // Read the file list once instead of once per hit.
        String[] fileList = File.ReadAllLines(fileListPath);
        int fileCount = _number_of_files < fileList.Length ? _number_of_files : fileList.Length;

        byte[][] bitmaps = new byte[terms.Count][];
        for (int t = 0; t < terms.Count; t++)
        {
            bitmaps[t] = LoadBitmap(terms[t].Token);
        }

        for (int group = 0; group * 8 < fileCount; group++)
        {
            byte value = 0x00;
            for (int t = 0; t < terms.Count; t++)
            {
                // A token that is absent from the corpus (or whose file is
                // short) matches no file, i.e. contributes a zero byte.
                byte termBits = group < bitmaps[t].Length ? bitmaps[t][group] : (byte)0;
                if (terms[t].IsComplement)
                {
                    termBits = (byte)(termBits ^ 0xFF);
                }
                if (terms[t].IsAnd)
                {
                    value &= termBits;
                }
                else
                {
                    value |= termBits;
                }
            }

            for (int bit = 0; bit < 8; bit++)
            {
                int fileIndex = group * 8 + bit;
                if (fileIndex >= fileCount)
                {
                    // Padding bits of the last byte; NOT can turn them on.
                    break;
                }
                if ((value & (0x80 >> bit)) != 0)
                {
                    result.Add(fileList[fileIndex]);
                }
            }
        }

        return result.ToArray();
    }

    /// <summary>One query token together with the operators that apply to it.</summary>
    private sealed class QueryTerm
    {
        public readonly String Token;
        public readonly bool IsComplement;
        public readonly bool IsAnd;

        public QueryTerm(String token, bool isComplement, bool isAnd)
        {
            Token = token;
            IsComplement = isComplement;
            IsAnd = isAnd;
        }
    }

    /// <summary>Splits a query into terms, attaching the preceding operators to each token.</summary>
    private static List<QueryTerm> ParseQuery(String query)
    {
        List<QueryTerm> terms = new List<QueryTerm>();
        String text = query.ToLowerInvariant();
        bool pendingAnd = false;
        bool pendingNot = false;
        int pos = 0;

        while (pos < text.Length)
        {
            char c = text[pos];
            if (IsTokenChar(c))
            {
                int start = pos;
                while (pos < text.Length && IsTokenChar(text[pos]))
                {
                    pos++;
                }
                // The first token is always ORed into the zero-initialised result.
                bool isAnd = terms.Count > 0 && pendingAnd;
                terms.Add(new QueryTerm(text.Substring(start, pos - start), pendingNot, isAnd));
                pendingAnd = false;
                pendingNot = false;
                continue;
            }

            if (c == '&')
            {
                pendingAnd = true;
            }
            else if (c == '|')
            {
                pendingAnd = false;
            }
            else if (c == '!')
            {
                pendingNot = !pendingNot;
            }
            // Anything else is a separator. Operators left dangling at the
            // end of the query are ignored.
            pos++;
        }

        return terms;
    }

    /// <summary>True for the characters allowed in a (lower-cased) token.</summary>
    private static bool IsTokenChar(char c)
    {
        return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_';
    }

    /// <summary>Yields the indexable tokens of a page of extracted text.</summary>
    private static IEnumerable<String> Tokenize(String text)
    {
        // Round trip through the system ANSI code page: characters it cannot
        // represent become '?', which is one of the delimiters.
        Encoding ansi = Encoding.Default;
        String normalized = Encoding.UTF8.GetString(
            Encoding.Convert(ansi, Encoding.UTF8, ansi.GetBytes(text)));

        foreach (String word in normalized.Split(Delimiters, StringSplitOptions.RemoveEmptyEntries))
        {
            String token = word.Trim().ToLowerInvariant();
            if (TokenPattern.IsMatch(token) && !ReservedNames.Contains(token))
            {
                yield return token;
            }
        }
    }

    /// <summary>
    /// Reads the bitmap of a token; returns an empty array when the token has
    /// no cache file or the file cannot be read.
    /// </summary>
    private byte[] LoadBitmap(String token)
    {
        if (ReservedNames.Contains(token))
        {
            return new byte[0];
        }

        String path = TokenFilePath(token);
        if (!File.Exists(path))
        {
            return new byte[0];
        }

        try
        {
            return File.ReadAllBytes(path);
        }
        catch (IOException)
        {
            // Best effort: an unreadable token file is treated as "no matches".
            return new byte[0];
        }
        catch (UnauthorizedAccessException)
        {
            return new byte[0];
        }
    }

    // System.IO.Path is fully qualified below because iTextSharp.text.pdf.parser
    // also declares a type named Path in recent iTextSharp versions.

    private String TokenDirectory()
    {
        return System.IO.Path.Combine(
            System.IO.Path.Combine(_path_to_search_domain, CacheDirName), TokenDirName);
    }

    private String FileListPath()
    {
        return System.IO.Path.Combine(
            System.IO.Path.Combine(_path_to_search_domain, CacheDirName), FileListName);
    }

    private String TokenFilePath(String token)
    {
        return System.IO.Path.Combine(TokenDirectory(), token + TokenFileExtension);
    }
}
}



Future Improvements

Support for all types of files.
Faster cache generation.
More sensible tokenization strategy.
More optimized storage use for cache.

You might also like