Skip to content
Snippets Groups Projects
Commit 362ad620 authored by Jan Mach's avatar Jan Mach
Browse files

Improved data migration script.

The data migration script was enhanced with a quick and slightly dirty fix for invalid string character encoding when converting records from MongoDB to the new PostgreSQL representation. The old implementation contained a bug, and some strings were not stored in the correct UTF-8 encoding. After this patch, everything should be UTF-8 in the PostgreSQL tables. (Redmine issue: #3752)
parent 552acc91
No related branches found
No related tags found
No related merge requests found
......@@ -77,7 +77,7 @@ import mentat.storage
import mentat.services.sqlstorage
import mentat.stats.idea
from mentat.datatype.internal import User, AbuseGroup, EventStat, Report
from mentat.datatype.sqldb import MODEL, UserModel, GroupModel, NetworkModel,\
from mentat.datatype.sqldb import UserModel, GroupModel, NetworkModel,\
FilterModel, SettingsReportingModel, EventStatisticsModel, EventReportModel, \
usermodel_from_typeddict, groupmodel_from_typeddict, setrepmodel_from_typeddict, \
filtermodel_from_typeddict, networkmodel_from_typeddict, eventstatsmodel_from_typeddict
......@@ -85,6 +85,45 @@ from mentat.datatype.sqldb import MODEL, UserModel, GroupModel, NetworkModel,\
#-------------------------------------------------------------------------------
# Accented characters (Czech, plus a few neighbouring ones such as 'ö' and 'ń')
# that may appear garbled in the legacy MongoDB records.
_ENCCONV_CHARACTERS = 'áÁéÉěĚíÍýÝóÓöůŮúÚŽžšŠčČřŘďĎťŤňŇń'

# Lookup table: garbled two-character sequence -> intended character. Each key
# is the character's UTF-8 byte pair misread as Latin-1, which is exactly the
# form the broken records contain. Deriving the keys instead of typing them
# keeps invisible control characters out of the source and guarantees that no
# entry can degrade into a no-op replacement.
_ENCCONV_TABLE = {
    char.encode('utf-8').decode('latin-1'): char
    for char in _ENCCONV_CHARACTERS
}


def encconv(val):
    """
    Fix invalid encoding of Czech characters between legacy MongoDB records and
    new UTF-8 based storages.

    Every known garbled sequence (UTF-8 bytes of an accented character that
    were decoded as Latin-1) is replaced with the intended character. Text
    that contains none of these sequences is returned unchanged.

    :param val: String to repair, or ``None``.
    :return: Repaired string; ``None`` is passed through untouched so that
             optional fields do not abort the conversion of a whole record.
    """
    if val is None:
        return val
    # Replacement order is irrelevant: every garbled sequence starts with one
    # of 'Ã', 'Ä' or 'Å' followed by a character from the range U+0080-U+00BF,
    # and no replacement result falls into either set, so one substitution can
    # never create or destroy a match for another.
    for garbled, fixed in _ENCCONV_TABLE.items():
        val = val.replace(garbled, fixed)
    return val
#
# Initialize and execute simple command line argument parser.
......@@ -152,6 +191,7 @@ MONGO_ITEMS = MONGOMANAGER.collection('db', 'col_users')
TOTAL_COUNT = MONGO_ITEMS.count()
print("* found total of {:,d} 'user' objects in MongoDB".format(TOTAL_COUNT))
ITEMCOUNTER = 0
for rawitem in MONGO_ITEMS.find().sort('_id', 1):
try:
mongoitem = User(rawitem)
......@@ -159,13 +199,18 @@ for rawitem in MONGO_ITEMS.find().sort('_id', 1):
print(mongoitem)
sqlitem = usermodel_from_typeddict(mongoitem)
sqlitem.fullname = encconv(sqlitem.fullname)
sqlitem.organization = encconv(sqlitem.organization)
#print("{} {}: {}".format(sqlitem.login, sqlitem.fullname, repr(sqlitem.organization)))
if ARGS.verbose:
print(sqlitem)
SQLSTORAGE.session.add(sqlitem)
#SQLSTORAGE.session.commit()
SQLSTORAGE.session.commit()
SQLUSERS[sqlitem.login] = sqlitem
ITEMCOUNTER = ITEMCOUNTER + 1
except sqlalchemy.exc.IntegrityError as err:
print("[ FAIL ] Duplicate user record '{}'".format(rawitem['_id']))
#print("{}: {}".format(sqlitem, err))
......@@ -174,6 +219,7 @@ for rawitem in MONGO_ITEMS.find().sort('_id', 1):
except:
print("[ FAIL ] Unable to convert user record '{}'".format(rawitem['_id']))
print("* converted total of {:,d} 'user' objects, {:,d} failure(s)".format(ITEMCOUNTER, TOTAL_COUNT - ITEMCOUNTER))
print("[ DONE ] Conversion: 'users'")
#
......@@ -184,6 +230,7 @@ MONGO_ITEMS = MONGOMANAGER.collection('db', 'col_groups')
TOTAL_COUNT = MONGO_ITEMS.count()
print("* found total of {:,d} 'group' objects in MongoDB".format(TOTAL_COUNT))
ITEMCOUNTER = 0
for rawitem in MONGO_ITEMS.find().sort('_id', 1):
try:
mongoitem = AbuseGroup(rawitem)
......@@ -195,9 +242,11 @@ for rawitem in MONGO_ITEMS.find().sort('_id', 1):
print(sqlitem)
SQLSTORAGE.session.add(sqlitem)
#SQLSTORAGE.session.commit()
SQLSTORAGE.session.commit()
SQLGROUPS[sqlitem.name] = sqlitem
ITEMCOUNTER = ITEMCOUNTER + 1
#
# Convert reporting settings for each group.
#
......@@ -237,9 +286,9 @@ for rawitem in MONGO_ITEMS.find().sort('_id', 1):
except Exception as err:
print("[ FAIL ] Unable to convert group record '{}': {}".format(rawitem['_id'], err))
print("* converted total of {:,d} 'group' objects, {:,d} failure(s)".format(ITEMCOUNTER, TOTAL_COUNT - ITEMCOUNTER))
print("[ DONE ] Conversion: 'groups'")
#
# Setup group membership relationships.
#
......@@ -293,62 +342,20 @@ print("[ DONE ] Commit to PostgreSQL")
#-------------------------------------------------------------------------------
#
# Convert event statistics database.
#
if not ARGS.skip_statistics:
print("\n[ BEGIN ] Conversion: 'statistics'")
MONGO_ITEMS = MONGOMANAGER.collection('db_stats', 'col_stats_alerts')
TOTAL_COUNT = MONGO_ITEMS.count()
print("* found total of {:,d} 'event statistics' objects in MongoDB".format(TOTAL_COUNT))
objcounter = 0
for rawitem in MONGO_ITEMS.find().sort('_id', 1):
rawitem = mentat.stats.idea.unescape_stats_full(rawitem)
mongoitem = EventStat(rawitem)
if ARGS.verbose:
print(mongoitem)
sqlitem = eventstatsmodel_from_typeddict(mongoitem)
if ARGS.verbose:
print(sqlitem)
try:
SQLSTORAGE.session.add(sqlitem)
SQLSTORAGE.session.commit()
except sqlalchemy.exc.IntegrityError as err:
print("[ FAIL ] Duplicate event statistics record for interval '{}'".format(sqlitem.interval))
SQLSTORAGE.session.rollback()
objcounter = objcounter + 1
if (objcounter % 1000) == 0:
print("* progress at {}: {:>12,d} ({:>7.3f}%)".format(
datetime.datetime.now(),
objcounter,
(objcounter/TOTAL_COUNT)*100,
))
print("[ DONE ] Conversion: 'statistics'")
else:
print("\n[ SKIP ] Conversion: 'statistics'")
#-------------------------------------------------------------------------------
#
# Convert event reports database.
#
if not ARGS.skip_reports:
print("\n[ BEGIN ] Conversion: 'reports'")
objcounter = 0
mongo_event_reports = MONGOMANAGER.collection('db', 'col_reports')
for rep in mongo_event_reports.find().sort('ts', 1):
rep = mentat.stats.idea.unescape_stats(rep)
mongorep = Report(rep)
MONGO_ITEMS = MONGOMANAGER.collection('db', 'col_reports')
TOTAL_COUNT = MONGO_ITEMS.count()
print("* found total of {:,d} 'event reports' objects in MongoDB".format(TOTAL_COUNT))
ITEMCOUNTER = 0
for rawitem in MONGO_ITEMS.find().sort('ts', 1):
rawitem = mentat.stats.idea.unescape_stats(rawitem)
mongorep = Report(rawitem)
if ARGS.verbose:
print(mongorep)
delta = mongorep['ts_to'] - mongorep['ts_from']
......@@ -363,7 +370,7 @@ if not ARGS.skip_reports:
sqlrep.handle = mongorep['ua_hash']
sqlrep.severity = mongorep['severity']
sqlrep.type = mongorep['type']
sqlrep.message = mongorep['message']
sqlrep.message = encconv(mongorep['message'])
sqlrep.createtime = mongorep['ts']
sqlrep.dt_from = mongorep['ts_from']
......@@ -426,9 +433,13 @@ if not ARGS.skip_reports:
print("{}: {}".format(sqlrep, err))
SQLSTORAGE.session.rollback()
objcounter = objcounter + 1
if (objcounter % 10000) == 0:
print("Progress {}".format(objcounter))
ITEMCOUNTER = ITEMCOUNTER + 1
if (ITEMCOUNTER % 1000) == 0:
print("* progress at {}: {:>12,d} ({:>7.3f}%)".format(
datetime.datetime.now(),
ITEMCOUNTER,
(ITEMCOUNTER/TOTAL_COUNT)*100,
))
print("[ DONE ] Conversion: 'reports'")
......@@ -438,6 +449,53 @@ else:
#-------------------------------------------------------------------------------
#
# Convert event statistics database.
#
# The whole statistics conversion is optional and is bypassed when
# ARGS.skip_statistics is set.
if not ARGS.skip_statistics:
    print("\n[ BEGIN ] Conversion: 'statistics'")

    MONGO_ITEMS = MONGOMANAGER.collection('db_stats', 'col_stats_alerts')
    # The total is fetched once up front; it is used only for the summary line
    # below and for the percentage in the progress reports.
    TOTAL_COUNT = MONGO_ITEMS.count()
    print("* found total of {:,d} 'event statistics' objects in MongoDB".format(TOTAL_COUNT))

    ITEMCOUNTER = 0
    # Records are walked sorted by '_id' (direction 1 - presumably ascending,
    # as in the MongoDB driver convention; confirm against MONGOMANAGER).
    for rawitem in MONGO_ITEMS.find().sort('_id', 1):

        # Raw record -> EventStat typed dict -> SQL model object.
        # NOTE(review): unescape_stats_full presumably reverses key escaping
        # applied when the statistics were stored in MongoDB - confirm in
        # mentat.stats.idea.
        rawitem = mentat.stats.idea.unescape_stats_full(rawitem)
        mongoitem = EventStat(rawitem)
        if ARGS.verbose:
            print(mongoitem)

        sqlitem = eventstatsmodel_from_typeddict(mongoitem)
        if ARGS.verbose:
            print(sqlitem)

        # Each record is committed on its own, so an integrity violation
        # (reported as a duplicate interval) rolls back only that record and
        # the loop carries on. Any other exception is not handled here.
        try:
            SQLSTORAGE.session.add(sqlitem)
            SQLSTORAGE.session.commit()
        except sqlalchemy.exc.IntegrityError as err:
            print("[ FAIL ] Duplicate event statistics record for interval '{}'".format(sqlitem.interval))
            SQLSTORAGE.session.rollback()

        # NOTE(review): assumed to sit at loop level, i.e. the counter tracks
        # processed records including rejected duplicates - the original
        # indentation was lost, confirm against the repository.
        ITEMCOUNTER = ITEMCOUNTER + 1
        # Emit a timestamped progress line after every 1000 processed records.
        if (ITEMCOUNTER % 1000) == 0:
            print("* progress at {}: {:>12,d} ({:>7.3f}%)".format(
                datetime.datetime.now(),
                ITEMCOUNTER,
                (ITEMCOUNTER/TOTAL_COUNT)*100,
            ))

    print("[ DONE ] Conversion: 'statistics'")

else:
    print("\n[ SKIP ] Conversion: 'statistics'")
#-------------------------------------------------------------------------------
DT_STOP = datetime.datetime.now()
print("")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment