You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
235 lines
7.7 KiB
Python
235 lines
7.7 KiB
Python
from datetime import datetime
|
|
from hashlib import sha256
|
|
from os import path
|
|
import re
|
|
import io
|
|
import hashlib
|
|
import base64
|
|
import json
|
|
|
|
class ExternalNER:
    """Simple holder for an external NER entry made of an ``id`` and a ``uri``."""

    def __init__(self):
        # Both fields start out as empty strings; the alternate constructors
        # below fill them in.
        self.id = ''
        self.uri = ''

    @classmethod
    def Init(cls, Id, Uri):
        """Create an instance from ``Id`` and ``Uri``, lower-casing both values."""
        instance = cls()
        instance.id, instance.uri = Id.lower(), Uri.lower()
        return instance

    @classmethod
    def InitFromArray(cls, Array):
        """Create one instance per item of ``Array``.

        Each item is stored unchanged (no lower-casing here) as both the
        ``id`` and the ``uri`` of its instance. Returns the instances as a
        list, in input order.
        """
        def build(value):
            entry = cls()
            entry.id = entry.uri = value
            return entry

        return [build(value) for value in Array]

    def __iter__(self):
        """Yield ``(key, value)`` pairs so that ``dict(self)`` works."""
        for key in ('id', 'uri'):
            yield key, getattr(self, key)

    @property
    def Dict(self):
        """Dictionary view of the instance: ``{'id': ..., 'uri': ...}``."""
        return {key: value for key, value in self}
|
|
|
|
class AmbarTaggingRule:
    """Holder for the fields of a tagging rule.

    Attributes: ``field``, ``regex``, ``tags``, ``name`` and ``enabled``.
    """

    def __init__(self):
        # Defaults describe an empty, disabled rule.
        self.field, self.regex, self.name = '', '', ''
        self.tags = []
        self.enabled = False

    @classmethod
    def Init(cls, TaggingRuleDictionary):
        """Create a rule from a mapping.

        ``TaggingRuleDictionary`` must provide the keys ``'field'``,
        ``'regex'``, ``'tags'``, ``'enabled'`` and ``'name'``; a missing key
        propagates the lookup error (``KeyError`` for a plain dict). Values
        are stored as given, without copying.
        """
        rule = cls()
        # Keys are read in this exact order, so the first missing one is the
        # one reported.
        for key in ('field', 'regex', 'tags', 'enabled', 'name'):
            setattr(rule, key, TaggingRuleDictionary[key])
        return rule
|
|
|
|
class AmbarLogRecord:
    """A log entry with creation/indexing timestamps, a source id, a type and a message."""

    def __init__(self):
        # Timestamps default to the (naive, local) moment of construction.
        self.created_datetime = datetime.now()
        self.indexed_datetime = datetime.now()
        self.source_id = ''
        self.type = ''
        self.message = ''

    @classmethod
    def Init(cls, SourceId, Type, Message):
        """Create a record stamped with the current time.

        ``SourceId``, ``Type`` and ``Message`` are each converted with
        ``str()`` before being stored.
        """
        record = cls()
        # Timestamps are taken again here, after construction.
        record.created_datetime = datetime.now()
        record.indexed_datetime = datetime.now()
        record.source_id = str(SourceId)
        record.type = str(Type)
        record.message = str(Message)
        return record

    def __iter__(self):
        """Yield ``(key, value)`` pairs so that ``dict(self)`` works.

        The two timestamps are rendered as ``YYYY-MM-DD HH:MM:SS.mmm``: the
        microsecond field is cut down to its first three digits.
        """
        for key in ('created_datetime', 'indexed_datetime'):
            rendered = getattr(self, key).strftime('%Y-%m-%d %H:%M:%S.%f')
            yield key, rendered[:-3]
        for key in ('source_id', 'type', 'message'):
            yield key, getattr(self, key)

    @property
    def Dict(self):
        """Dictionary view of the record, with timestamps already formatted."""
        return {key: value for key, value in self}
|
|
|
|
class AmbarFileContent:
    """Parsed content of a file plus a few descriptive fields taken from parser metadata."""

    def __init__(self):
        self.processed_datetime = ''
        self.size = 0
        self.state = 'new'
        self.title = ''
        self.language = ''
        self.type = ''
        self.author = ''
        self.length = ''
        self.text = ''
        self.thumb_available = False
        self.ocr_performed = False

    @classmethod
    def Init(cls, ParserResponse, FileSize):
        """Create a 'processed' content object from a parser response.

        ``ParserResponse`` is read through its ``meta`` (supports ``in`` and
        item lookup), ``text`` and ``ocrPerformed`` attributes. ``FileSize``
        is stored as ``size`` unchanged. Metadata keys that are absent fall
        back to an empty string.
        """
        content = cls()

        # Current time as 'YYYY-MM-DD HH:MM:SS.mmm' (microseconds cut to 3 digits).
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
        content.processed_datetime = stamp[:-3]
        content.size = FileSize
        content.state = 'processed'

        # (attribute on this object, key looked up in ParserResponse.meta)
        meta_fields = (
            ('title', 'title'),
            ('language', 'language'),
            ('type', 'Content-Type'),
            ('author', 'Author'),
        )
        for attribute, meta_key in meta_fields:
            if meta_key in ParserResponse.meta:
                value = ParserResponse.meta[meta_key]
            else:
                value = ''
            setattr(content, attribute, value)

        content.length = len(ParserResponse.text)
        content.text = ParserResponse.text
        content.ocr_performed = ParserResponse.ocrPerformed

        ## non serializable content (not emitted by __iter__)
        content.initialized = True
        content.message = 'ok'
        return content

    def __iter__(self):
        """Yield ``(key, value)`` pairs for the serializable fields, in a fixed order."""
        serializable = (
            'processed_datetime', 'size', 'state', 'title', 'language',
            'type', 'author', 'length', 'text', 'thumb_available',
            'ocr_performed',
        )
        for key in serializable:
            yield key, getattr(self, key)

    @property
    def Dict(self):
        """Dictionary view of the serializable fields."""
        return {key: value for key, value in self}
|
|
|
|
class AmbarFileMeta:
    """Metadata of a file: names, extension, source, timestamps and extra entries.

    Instances are produced by the ``Init`` / ``InitWithoutId`` constructors,
    which never raise: on failure they return an object whose ``initialized``
    flag is ``False`` and whose ``message`` holds the error text.
    """

    def __init__(self):
        self.id = ''
        self.full_name = ''
        self.full_name_parts = []
        self.short_name = ''
        self.extension = ''
        self.extra = []
        self.source_id = ''
        self.created_datetime = ''
        self.updated_datetime = ''
        ## non serializable content (not emitted by __iter__)
        self.initialized = False
        self.message = ''

    @staticmethod
    def _FormatTimestamp(Value):
        """Return ``Value`` unchanged if it is a string, else format it.

        Non-string values must provide ``strftime``; they are rendered as
        ``YYYY-MM-DD HH:MM:SS.mmm`` (microseconds cut to three digits).
        """
        if isinstance(Value, str):
            return Value
        return Value.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

    @classmethod
    def ParseFullNameIntoParts(cls, FullName):
        """Return every '/'-terminated prefix of ``FullName`` followed by ``FullName`` itself.

        Slashes at index 0 or 1 are skipped, so a leading ``//`` does not
        produce prefixes of its own. Example: ``'//src/dir/f.txt'`` gives
        ``['//src/', '//src/dir/', '//src/dir/f.txt']``.
        """
        parts = [
            FullName[:match.start() + 1]
            for match in re.finditer(r'/', FullName)
            if match.start() > 1
        ]
        parts.append(FullName)
        return parts

    @classmethod
    def Init(cls, MetaDict):
        """Create file metadata, including its ``id``, from a mapping.

        ``MetaDict`` must provide ``'full_name'``, ``'short_name'``,
        ``'extension'``, ``'extra'``, ``'source_id'``, ``'created_datetime'``
        and ``'updated_datetime'``. The ``id`` is the SHA-256 hex digest of
        source id, full name, created and updated timestamps concatenated.

        Any exception is captured: the returned object then has
        ``initialized == False`` and ``message`` set to the error text.
        """
        amFileMeta = cls()
        try:
            amFileMeta.full_name = MetaDict['full_name']
            amFileMeta.full_name_parts = cls.ParseFullNameIntoParts(MetaDict['full_name'])
            amFileMeta.short_name = MetaDict['short_name']
            amFileMeta.extension = MetaDict['extension']
            amFileMeta.extra = MetaDict['extra']
            amFileMeta.source_id = MetaDict['source_id']
            amFileMeta.created_datetime = MetaDict['created_datetime']
            amFileMeta.updated_datetime = MetaDict['updated_datetime']
            identity = '{0}{1}{2}{3}'.format(
                MetaDict['source_id'],
                MetaDict['full_name'],
                MetaDict['created_datetime'],
                MetaDict['updated_datetime'])
            amFileMeta.id = sha256(identity.encode('utf-8')).hexdigest()
            ## non serializable content
            amFileMeta.initialized = True
            amFileMeta.message = 'ok'
        except Exception as ex:
            # Deliberate best-effort: report the failure through the object
            # instead of raising to the caller.
            amFileMeta.initialized = False
            amFileMeta.message = str(ex)
        return amFileMeta

    @classmethod
    def InitWithoutId(cls, CreateTime, UpdateTime, ShortName, FullName,
                      AmbarCrawlerId, Extra=None):
        """Create file metadata from individual values, leaving ``id`` empty.

        ``CreateTime`` / ``UpdateTime`` may be strings (stored as given) or
        objects with ``strftime`` (formatted as ``YYYY-MM-DD HH:MM:SS.mmm``).
        ``extension`` is the suffix of ``ShortName`` as split by
        ``os.path.splitext``; when there is no suffix the whole stem is used.
        ``Extra`` defaults to a fresh empty list per call; a list passed by
        the caller is stored by reference.

        Any exception is captured: the returned object then has
        ``initialized == False`` and ``message`` set to the error text.
        """
        amFileMeta = cls()
        try:
            amFileMeta.full_name = FullName
            amFileMeta.full_name_parts = cls.ParseFullNameIntoParts(FullName)
            amFileMeta.short_name = ShortName
            stem, suffix = path.splitext(ShortName)
            amFileMeta.extension = suffix if suffix != '' else stem
            # A new list per call: a shared default list would leak entries
            # between unrelated instances.
            amFileMeta.extra = [] if Extra is None else Extra
            amFileMeta.source_id = AmbarCrawlerId
            amFileMeta.created_datetime = cls._FormatTimestamp(CreateTime)
            amFileMeta.updated_datetime = cls._FormatTimestamp(UpdateTime)
            ## non serializable content
            amFileMeta.initialized = True
            amFileMeta.message = 'ok'
        except Exception as ex:
            # Deliberate best-effort: report the failure through the object
            # instead of raising to the caller.
            amFileMeta.initialized = False
            amFileMeta.message = str(ex)
        return amFileMeta

    def __iter__(self):
        """Yield ``(key, value)`` pairs for the serializable fields.

        Each element of ``extra`` is converted with ``dict()``; ``initialized``
        and ``message`` are not emitted.
        """
        yield 'id', self.id
        yield 'full_name', self.full_name
        yield 'full_name_parts', self.full_name_parts
        yield 'short_name', self.short_name
        yield 'extension', self.extension
        yield 'extra', [dict(entry) for entry in self.extra]
        yield 'source_id', self.source_id
        yield 'created_datetime', self.created_datetime
        yield 'updated_datetime', self.updated_datetime

    @property
    def Dict(self):
        """Dictionary view of the serializable fields."""
        return dict(self)