import csv
from itertools import islice
from datetime import datetime
import ast
import hashlib
import json
import re

import emoji
import meilisearch
import pytz
from terminusdb_client import WOQLClient

from schema import ImpactArea, Blockchain, Topic, Web3, Organization

# Keep every parsed organization in a dict keyed by a running counter,
# plus an optional list of their JSON-serialized forms.
orgs = {}
orgsjson = []

# TerminusDB client used for document storage
client = WOQLClient("https://cloud.terminusdb.com/Myseelia/")
client.connect(db="play", team="Myseelia", use_token=True)

# Meilisearch client used for full-text search indexing
client1 = meilisearch.Client(
    'https://ms-9ea4a96f02a8-1969.sfo.meilisearch.io', '117c691a34b21a6651798479ebffd181eb276958')
index = client1.index('orgs')


def get_emoji_regexp():
    # Sort emoji by length so that multi-character emojis are matched first
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
    return re.compile(pattern)


def remove_emojis(string):
    return get_emoji_regexp().sub(r'', string)
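# For example, remove_emojis("ReFi 🌱 projects") should give "ReFi  projects"
# (exact matches depend on the installed emoji package's EMOJI_DATA table).

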
def hash_string(string):
    sha256 = hashlib.sha256()
    sha256.update(string.encode('utf-8'))
    return sha256.hexdigest()
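# Note: hash_string yields the 64-character hex SHA-256 digest of the UTF-8
# encoded input; it is defined as a helper but not called elsewhere in this script.

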
def to_json(obj):
    obj_dict = obj.__dict__
    if obj_dict['blockchainecosystem']:
        print(obj_dict['blockchainecosystem'])
        obj_dict['blockchainecosystem'] = [
            bc.name for bc in obj_dict['blockchainecosystem']]
    else:
        obj_dict['blockchainecosystem'] = None
    if obj_dict['impactarea']:
        print(obj_dict['impactarea'])
        obj_dict['impactarea'] = [ia.name for ia in obj_dict['impactarea']]
    else:
        obj_dict['impactarea'] = None
    if obj_dict['topic']:
        print(obj_dict['topic'])
        obj_dict['topic'] = [t.name for t in obj_dict['topic']]
    else:
        obj_dict['topic'] = None
    if obj_dict['web3']:
        print(obj_dict['web3'])
        obj_dict['web3'] = [w.name for w in obj_dict['web3']]
    else:
        obj_dict['web3'] = None
    if obj_dict['datecreated']:
        print(obj_dict['datecreated'])
        obj_dict['datecreated'] = obj_dict['datecreated'].isoformat()
    return json.dumps(obj_dict)
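# to_json serializes an Organization to a JSON string; with the schema imported
# above the result looks roughly like
# '{"name": "...", "topic": ["SupplyChain"], "datecreated": "2023-01-20T15:45:00+00:00", ...}'.

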
with open("Organizations.csv") as file:
    csv_file = csv.reader(file)
    next(csv_file)  # skipping the header row
    chunk_size = 1000
    counter = 0  # running key into the orgs dict, shared across all chunks
    while True:
        chunk = list(islice(csv_file, chunk_size))
        if not chunk:
            break
        # Process the chunk of rows
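        # Column layout inferred from the row accesses below: 0=assignee,
        # 1=blockchain ecosystem, 2=date created, 3=description, 4=impact area,
        # 5=logo, 6=name, 7=upvotes, 8=reviewed, 9-11=submitter details,
        # 12=subscribed, 13=topic, 15=web3 category.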
        for row in chunk:
            row = [remove_emojis(cell) for cell in row]
            assignee = row[0]

            # Map the impact-area column onto ImpactArea enum members
            impactArea = row[4].strip("{}").split(",")
            impact_area_set = set()
            for value in impactArea:
                if value:
                    value = value.strip().strip('"')
                    if value == "Social justice":
                        impact_area_set.add(ImpactArea.SocialJustice)
                    elif value in ("Food & Agriculture", "Food & Ag."):
                        impact_area_set.add(ImpactArea.FoodAg)
                    elif value == "Invest":
                        impact_area_set.add(ImpactArea.Investing)
                    elif value == "Politics & activism":
                        impact_area_set.add(ImpactArea.Politicsactivism)
                    elif value == "Innovate":
                        impact_area_set.add(ImpactArea.Innovation)
                    else:
                        impact_area_set.add(ImpactArea[value])

            # Map the blockchain-ecosystem column onto Blockchain enum members
            blockchainEcosystem = row[1].strip("{}").split(",")
            blockchainEcosystem_set = set()
            for value in blockchainEcosystem:
                if value:
                    blockchain = value.strip()
                    if blockchain == "Binance Smart Chain":
                        blockchain = Blockchain.BinanceSmartChain
                    elif blockchain == "Regen Network":
                        blockchain = Blockchain.RegenNetwork
                    elif blockchain == "Energy Web Chain":
                        blockchain = Blockchain.EnergyWebChain
                    elif blockchain == "Hyperledger Fabric":
                        blockchain = Blockchain.HyperledgerFabric
                    elif blockchain == "Zero Carbon":
                        blockchain = Blockchain.ZeroCarbon
                    elif blockchain == "IXO":
                        blockchain = Blockchain.ixo
                    elif blockchain in ("Not found", "Not sure / still deciding"):
                        blockchain = Blockchain.Other
                    elif blockchain == "Not applicable":
                        break
                    else:
                        blockchain = Blockchain[blockchain]
                    blockchainEcosystem_set.add(blockchain)

            # Web3 categories may contain commas inside parentheses, so split only
            # on commas that are not inside "(...)"
            web3 = row[15].strip("{}")
            web3_set = set()
            for value in re.split(r",(?![^(]*\))", web3):
                if value:
                    web3 = value.strip().strip('"')
                    # someone put "Blockchain (L1,DAO" which will match to
                    # "Blockchain (L1" since we strip the '"'
                    if web3 in ("Blockchain (L1, L2)", "Blockchain (L1,L2)", "Blockchain (L1"):
                        web3 = Web3.Blockchain
                    else:
                        web3 = Web3[web3]
                    web3_set.add(web3)

            topic = row[13].strip("{}").split(",")
            topic_set = set()
            for value in topic:
                if value:
                    topic = value.strip()
                    if topic == "inclusion and equality":
                        topic = Topic.inclusionequality
                    elif topic == "Circular Economy":
                        topic = Topic.CircularEconomy
                    elif topic == "Financial Inclusion":
                        topic = Topic.Financial_Inclusion
                    elif topic == "origin & trace":
                        topic = Topic.Traceability
                    elif topic == "Supply Chain":
                        topic = Topic.SupplyChain
                    elif topic == "Move-to-earn":
                        topic = Topic.Movetoearn
                    elif topic == "Work & Business":
                        topic = Topic.Work
                    elif topic == "Food Forests":
                        topic = Topic.FoodForests
                    elif topic == "Eco-Living":
                        topic = Topic.EcoLiving
                    else:
                        topic = Topic[topic]
                    topic_set.add(topic)

            # Parse the creation date (e.g. "01/20/2023 03:45 PM") and convert it to UTC
            date_string = row[2]
            utc_date = datetime.min
            if date_string:
                date_format = "%m/%d/%Y %I:%M %p"
                parsed_date = datetime.strptime(date_string, date_format)
                # Convert the naive datetime object to a UTC-aware one
                utc_date = pytz.utc.normalize(pytz.utc.localize(parsed_date))

            preJan20thUpvotesstr = row[7]
            preJan20thUpvotesint = 0
            if preJan20thUpvotesstr.isdigit():
                preJan20thUpvotesint = int(preJan20thUpvotesstr)

            upvotesstr = row[7]
            upvotesint = 0
            if upvotesstr.isdigit():
                upvotesint = int(upvotesstr)

            org = Organization(
                assignee=row[0] if row[0] not in [None, ''] else None,
                blockchainecosystem=blockchainEcosystem_set if len(
                    blockchainEcosystem_set) > 0 else None,
                description=row[3] if row[3] not in [None, ''] else None,
                logo=row[5] if row[5] not in [None, ''] else None,
                # name is the only mandatory field, so default it to "" if blank
                name=row[6] if row[6] not in [None, ''] else "",
                preJan20thUpvotes=preJan20thUpvotesint if preJan20thUpvotesint != 0 else None,
                reviewed=row[8] if row[8] not in [None, ''] else None,
                submittedbyemail=row[9] if row[9] not in [None, ''] else None,
                submittedbyname=row[10] if row[10] not in [None, ''] else None,
                submittedbyowner=row[11] if row[11] not in [None, ''] else None,
                subscribed=row[12] if row[12] not in [None, ''] else None,
                topic=topic_set if len(topic_set) > 0 else None,
                upvotes=upvotesint if upvotesint != 0 else None,
                web3=web3_set if len(web3_set) > 0 else None,
                impactarea=impact_area_set if len(impact_area_set) > 0 else None,
                datecreated=utc_date if utc_date != datetime.min else None,
            )

            orgs[counter] = org
            # print(to_json(org))
            # orgsjson.append(to_json(org))
            counter += 1


# Insert all organizations into TerminusDB in one commit
inserted = client.insert_document(
    list(orgs.values()), commit_msg="Adding orgs")

# Mirror the inserted documents into the Meilisearch index
documents = []
for doc_id in inserted:
    document = client.get_document(doc_id)
    real_id = document['@id']
    num_id = real_id.split("/")[-1]
    document = {k: json.dumps(v) for k, v in document.items() if k != '@id'}
    document.update({'id': num_id})
    documents.append(document)

index.add_documents(documents)
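
# Once indexed, the organizations can be searched through Meilisearch, e.g.
# index.search('regenerative') returns a dict whose 'hits' list holds matching documents.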