# codesearch/createIndex.py at master · absagar/codesearch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import codecs
from os import walk, mkdir
import os.path
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.query import Term
from django.conf import settings

# Configuration pulled from the Django settings module.
# basePath: root of the source tree that gets walked and indexed.
basePath = settings.BASE_PATH
# targetPath: directory handed to create_in() for this index.
targetPath = "/".join((settings.INDEX_DIR, "woosterapp"))
# ext_list: file extensions (text after the last dot) that start() indexes.
ext_list = settings.EXT_LIST
# filterFileType: "filetype" value used to build the sample Term query below.
filterFileType = "hpp"

def start(mypath, writer):
    """Index every file under *mypath* whose extension is in ``ext_list``.

    The tree is walked with ``os.walk``. Each matching file is read as UTF-8
    (undecodable bytes are ignored) and added to *writer* as one document
    with the fields ``path``, ``filetype``, ``uipath`` and ``content``.
    The writer is committed once, after the whole walk has succeeded.

    Args:
        mypath: Root directory to walk.
        writer: Index writer providing ``add_document``, ``commit`` and
            ``cancel`` (for example the object returned by ``ix.writer()``).

    Raises:
        Exception: Any error raised while reading a file or adding a
            document is re-raised after the writer has been cancelled.
    """
    try:
        for dirpath, _dirnames, filenames in walk(mypath):
            for f in filenames:
                # Text after the last dot; a name without a dot yields the
                # whole file name.
                ext = f.rsplit(".", 1)[-1]
                if ext not in ext_list:
                    continue
                completePath = os.path.join(dirpath, f)
                print("%s %s" % (f, dirpath))
                with codecs.open(completePath, encoding='utf-8', errors='ignore') as content_file:
                    cont = content_file.read()
                # relpath instead of slicing by len(basePath)+1: the slice
                # dropped the first character of the relative path whenever
                # basePath ended with a path separator.
                uipath = os.path.relpath(completePath, basePath)
                writer.add_document(path=unicode(completePath), filetype=unicode(ext),
                                    uipath=unicode(uipath), content=cont)
    except Exception:
        # Discard the partial batch rather than leaving the writer open.
        writer.cancel()
        raise
    writer.commit()


# Index schema: ``path`` is an ID field and ``filetype`` a TEXT field, both
# declared with stored=True; ``uipath`` and ``content`` are TEXT fields
# declared without a stored flag.
schema = Schema(path=ID(stored=True), filetype=TEXT(stored=True), uipath=TEXT, content=TEXT)

# Make sure the index directory exists before create_in() is called (the
# Whoosh quickstart does the same), so a first run does not fail on a
# missing directory.
if not os.path.exists(targetPath):
    os.makedirs(targetPath)

# Create a fresh index in targetPath and fill it from the tree at basePath.
ix = create_in(targetPath, schema)
writer = ix.writer()
start(basePath, writer)


#ix = open_dir("../whooshter_external_files/indexdir",schema=schema)
# Sample query against the freshly built index; prints diagnostics only.
with ix.searcher() as searcher:
    # Search file content.
    query = QueryParser("content", ix.schema).parse(u"adobe")
    # NOTE(review): filter_q is built but never passed to search(); confirm
    # whether the results were meant to be filtered by file type.
    filter_q = Term("filetype", filterFileType)
    results = searcher.search(query, limit=None)
    print("%s %s" % (results.scored_length(), len(results)))
    dn = None  # stays None when the query has no hits
    for result in results:
        print(result['filetype'])
        print(type(result))
        dn = result.docnum
    # dn is an index document number, not a position in ``results``, so the
    # document is looked up through the searcher instead of results[dn].
    if dn is not None:
        print(searcher.stored_fields(dn))

# NOTE(review): this reader is opened but never used or closed in the
# visible code; confirm whether it is still needed.
reader = ix.reader()
# Highlighting example (was meant to run inside the result loop):
#         with codecs.open(result["path"], encoding='utf-8') as fileobj:
#             filecontents = fileobj.read()
#         print(result.highlights("content", text=filecontents))

# Search by file name instead of content:
#     query1 = QueryParser("uipath", ix.schema).parse(u"ftw")
#     results1 = searcher.search(query1)
#     print len(results1)
#     print results1[0]
#     print results1[0]['path']
# Use this for paged searching:
#     s.search_page(q, 5, pagelen=20)