Benjamin Melançon
26e29d3157
But what we learned is that we have no way to expand 'NICHQ.org' into 'NICHQ Support: NICHQ.org work on' or something like that, followed by whatever i put in. That's not really a flaw of our parse script here, though, but of our failure to have come up with an official way to track work on different projects within the one support contract.
228 lines
12 KiB
Python

import pandas as pd
import numpy as np
import glob
import re
import sys
# Import our local settings management.
import settings
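
# The local settings module is expected (judging from the calls below) to provide:
#   pomodoro_logfile()                  -> path to a single CSV log, or something falsy
#   pomodoro_logpath()                  -> directory holding multiple CSV logs
#   pomodoro_latest_recorded(...)       -> get (no args) or set the most recent recorded timestamp
#   harvest_get_projects_clients_map()  -> dict mapping Harvest project names to client names
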
# When run interactively (the Python shell sets sys.ps1), keep debugging copies
# of intermediate data around and skip writing any output at the end.
if hasattr(sys, 'ps1'):
    import copy
    debug = True
else:
    debug = False

if settings.pomodoro_logfile():
    # This works for one file:
    timelog = pd.read_csv(settings.pomodoro_logfile())
else:
    # For multiple files:
    path = settings.pomodoro_logpath()
    all_files = glob.glob(path + "*.csv")

    li = []

    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        li.append(df)

    timelog = pd.concat(li, axis=0, ignore_index=True)

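# Each PomodoroPrompt CSV is assumed to have at least the columns 'intention',
# 'description', 'started', and 'recorded' (timestamps saved as UTC), since
# those are the fields everything below relies on.
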
if debug:
    imported = copy.deepcopy(timelog)

timelog.drop_duplicates(inplace=True)

if debug:
    nodupes = copy.deepcopy(timelog)

# Dump bad data. The real solution here is to get rid of the damned 'Cancel'
# button on the Pomodoro Prompt dialog, but i don't know how to do that, so we
# need to drop the rows where the work task description is blank, which is
# coming in as not a number (NaN) for reasons i'm not entirely clear on. Maybe
# because it's the last row of the spreadsheet? Anyway we cannot do anything
# with no data in the description, so drop them at the outset.
# We can allow no data in the 'intention', so define the three columns to check:
timelog = timelog.dropna(subset=['started', 'recorded', 'description'])
timelog = timelog.reset_index(drop=True)

if debug:
    dropna = copy.deepcopy(timelog)

# For debugging, keep originals around.
timelog["orig_desc"] = timelog["description"]
timelog["orig_started"] = timelog["started"]
timelog["orig_recorded"] = timelog["recorded"]

# Clean up description before we go to work on it.
timelog['description'] = timelog['description'].str.strip()

# Allow multiple entries to be put into one prompt by splitting with semicolon.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = list(timelog["description"].str.split(";"))
timelog = timelog.explode("description").reset_index()
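# For example, a single prompt entry like "Find It Cambridge: fix search; email
# triage" (hypothetical) explodes into two rows, one per semicolon-separated task.
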
if debug:
    mess = copy.deepcopy(timelog)

timelog["started"] = pd.to_datetime(timelog["started"], errors='coerce').dt.tz_convert("US/Eastern")
timelog["recorded"] = pd.to_datetime(timelog["recorded"]).dt.tz_convert("US/Eastern")

latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    timelog = timelog[timelog.recorded > pd.to_datetime(latest_recorded)]

timelog["time"] = 30
|
|
# A pomodoro started before 3am Eastern time is considered to be a continuation
|
|
# of the day before, so we are, effectively, on West Coast time for determining
|
|
# the day we want to associate a time entry with. PomodoroPrompt saves as UTC.
|
|
timelog["date"] = timelog["started"].dt.tz_convert("US/Pacific").dt.date
|
|
timelog["day_of_week"] = pd.to_datetime(timelog["date"]).dt.day_name()
# If a project has been specified (task prefixed with a colon), then put the
# project in its own column.
timelog['project'] = (np.where(timelog['description'].str.contains(': '), timelog['description'].str.split(': ', n=1).str[0], None))
timelog['description'] = (np.where(timelog['description'].str.contains(': '), timelog['description'].str.split(': ', n=1).str[1], timelog['description']))

# Mid-work clean up of description and new project.
timelog['description'] = timelog['description'].str.strip()
timelog['project'] = timelog['project'].str.strip()

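# For example, a description of "NICHQ Support: data import" (hypothetical) ends
# up as project "NICHQ Support" and description "data import"; descriptions with
# no ": " separator keep a project of None.
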
# If a multiplier has been provided (an asterisk and an integer at the end of a
# task), then multiply the time by it and remove it from the description.
# Ensure we're splitting on the same asterisk we found: use the end-of-string
# anchor ($) in the regular expression, and split from the right.
p = re.compile(r'\*\s*\d+$')
# On some systems, using np.where worked but on others it failed. Why it worked
# is unknown, but why it failed is because numpy where evaluates all parts, even
# the parts that will never get used because the where clause does not apply!
# This caused the chained string methods to fail because there was no string.
# timelog['tmp_multiplier'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[1].str.strip(), 1))
# timelog['description'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[0], timelog['description']))
timelog['tmp_multiplier'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[1].strip() if p.search(x) else 1)
timelog['description'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[0] if p.search(x) else x)
timelog["time"] = timelog["time"] * timelog['tmp_multiplier'].astype(int)
timelog.drop(columns=['tmp_multiplier'], inplace=True)

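# For example, a description ending in " *3" (hypothetical) has the " *3"
# stripped off and its time tripled from 30 to 90 minutes.
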
# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].str.strip()

# Mostly historical: a helper for breaking compound project-tasks into their
# constituent parts.
compound_project_tasks = {
    "Drutopia — Contributing back to the community": ["Drutopia contrib", "Drutopia contributing", "Drutopia contributions"],
    "Find It Cambridge — Contributing back to the community": ["Find It Contrib"],
    "Find It Cambridge — Planning": ["Find It project management"],
    "Internal — Contributing back to the community": ["Contrib", "Agaric: contrib", "Contributing", "Agaric contrib", "Agaric contributions"],
    "Internal — Network Engagement": ["Network Engagement", "network engagement", "Network engagment", "Social media", "Network building", "Agaric network engagement"],
    "Internal — Content": ["Agaric site content", "Agaric content"],
    "VHFA — Contributing back to the community": ["VHFA contrib"],
}
for preferred, alternatives in compound_project_tasks.items():
    # We compare all alternatives to lower case versions, and add the
    # preferred output to this list for that purpose, but note that what we use
    # as preferred retains its capitalization.
    alternatives.append(preferred)
    alternatives = [item.lower() for item in alternatives]
    timelog.loc[timelog.project.str.lower().isin(alternatives), "project"] = preferred

# If a compound project was specified, break that out into a sub-project (in
# Harvest, we use Task, which is really task type, for this).
timelog['subproject'] = (np.where(timelog['project'].str.contains(' — '), timelog['project'].str.split(' — ', n=1).str[1], None))
timelog['project'] = (np.where(timelog['project'].str.contains(' — '), timelog['project'].str.split(' — ', n=1).str[0], timelog['project']))

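# For example, a project normalized to "Internal — Network Engagement" above is
# split here into project "Internal" and subproject "Network Engagement".
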
# Replace irregular-but-known project names with ones timetracking tools use.
harvest_project_names = {
    "Boston Modern Orchestra Project": ["BMOP", "BMOP.org"],
    "CRLA.org upgrade": ["CRLA", "CRLA upgrade"],
    "Cockrill Precision Products": ["Cockrill Corp", "Cockrill"],
    "Cultura Continued Support": ["Cultura", "MIT Cultura"],
    "Drutopia": ["Drutopia improvements", "Drutopia overhead"],
    "EC Connect": ["eccconectcolorado.org", "Denver Econnect", "Denver Early Childhood", "ECconnect", "ECconnectColorado"],
    "Eliot School Site & CRM": ["Eliot", "Eliot School"],
    "encuentro 5 sites": ["Encuentro5", "e5", "Encuentro"],
    "Family & Home": ["Family and Home", "Family home"],
    "Find It Cambridge": ["Find It", "FIC", "Cambridge"],
    "GEO Support": ["GEO", "GEO.coop", "Grassroots Economic Organizing"],
    "Immigrant Navigator": ["IFSI", "Immigrant Family Services"],
    "Internal": ["Agaric", "Agaric internal"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "Internal: Personal Learning": ["Learning", "Personal learning"],
    "MASS Continuous Improvement": ["MASS Design Group", "MASS", "MASS Design"],
    "Metalwerx Maintenance": ["Metalwerx", "Metalwerx.com"],
    "NICHQ Data Upgrade": ["NICHQ Data"],
    "NICHQ Support": ["NICHQ", "NICHQ maintenance"],
    "NICHQ FL CMS LAN": ["FL CMS LAN", "flcmslan", "NICHQ FLCMSLAN"],
    "OAG - Office of Opportunity and Achievement Gaps Task Force": ["Boston Public Schools", "BPS", "OAG"],
    "Portside": ["Portside.org Improvements 2020", "portside.org", "Portside support"],
    "SCDTDP Collaboratory Data Site System Security": ["SCDTDP", "NICHQ SCDTDP", "NICHQ security"],
    "Project GUTS/TWIG/Making Sense of Models": ["Teachers with GUTS", "TWIG", "GUTS", "Project GUTS"],
    "The Propaganda Site": ["TPS", "Propaganda Site", "The Propganda Site", "Murat & Clay"],
    "VHFA": ["Vermont Housing Finance Agency", "Vermont", "Vermont Housing"],
    "Vulk redesign": ["Vulk", "Vulk.coop"],
}
other_project_names = {
    "Near North camp": ["Near North Camp", "Near North defense", "Encampment support", "Camp support", "NN camp defense", "NN camp", "NN defense", "Near North camp defense", "Camp", "Near North"],
    "Personal": ["Personal/external", "Personal / external", "External"],
    "Tzedakah": ["Community support"],
    "PWGD": ["People Who Give a Damn", "PWGD Inc"],
    "Workers Defense Alliance": ["WDA", "Alliance", "Twin Cities Workers Defense Alliance"],
}

replacement_project_names = harvest_project_names | other_project_names

for preferred, alternatives in replacement_project_names.items():
    # We compare all alternatives to lower case versions, and add the
    # preferred output to this list for that purpose, but note that what we use
    # as preferred retains its capitalization.
    alternatives.append(preferred)
    alternatives = [item.lower() for item in alternatives]
    timelog.loc[timelog.project.str.lower().isin(alternatives), "project"] = preferred

# Replace irregular-but-known subproject ("Task") names with ones timetracking tools use.
# Development is the default and never specified.
subproject_names = {
    "Contributing back to the community": ["contrib", "contributing", "contributions"],
    "Not billed": ["nb"],
    "Planning": ["plan", "meeting", "pm", "project management"],
}
for preferred, alternatives in subproject_names.items():
    alternatives.append(preferred)
    alternatives = [item.lower() for item in alternatives]
    timelog.loc[timelog.subproject.str.lower().isin(alternatives), "subproject"] = preferred

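# For example, a subproject entered as "pm" is normalized to "Planning"; entries
# with no subproject at all fall through to "Development" in the groupby below.
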
# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# The fillna is essential or we drop entries with blank ('None') projects.
tl = timelog.groupby(["date", timelog.project.fillna(""), timelog.subproject.fillna("Development"), "description"]).agg({"time": 'sum', "started": 'min', "recorded": 'max'}).reset_index()

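# For example, three separate 30-minute pomodoros logged on the same date with
# project "Internal", subproject "Planning", and identical descriptions
# (hypothetical values) come out of this groupby as one row with time 90.
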
# We're doing the final conversion to Harvest as a separate step because we
# want to factor out all of the above non-Harvest-specific logic.

latest = tl.recorded.max()
datest = str(tl.date.max())

# Separate Harvest from non-Harvest projects, and also filter out any blank or
# otherwise unrecognized projects, but save those too for a CSV of the excluded items.
hrvst = tl[tl.project.isin(harvest_project_names.keys())]
other = tl[tl.project.isin(other_project_names.keys())]
unknown = tl[~tl.project.isin(replacement_project_names.keys())]

harvest = hrvst.rename(columns = {'date': 'Date', 'project': 'Project', 'subproject': 'Task', 'description': 'Notes'})
harvest["Hours"] = harvest["time"]/60
harvest["First name"] = "Benjamin"
harvest["Last name"] = "Melançon"
project_client_mapping = settings.harvest_get_projects_clients_map()
harvest["Client"] = harvest["Project"].map(project_client_mapping)
harvest.drop(columns = ['started', 'recorded', 'time'], inplace=True)

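# At this point harvest holds one row per condensed entry with the columns Date,
# Project, Task, Notes, Hours, First name, Last name, and Client (presumably the
# shape the Harvest timesheet CSV import expects).
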
if not debug:
    harvest.to_csv('harvest-timesheets-' + datest + '.csv', index=False)
    other.to_csv('personal-other-' + datest + '.csv', index=False)
    unknown.to_csv('unknown-' + datest + '.csv', index=False)
    settings.pomodoro_latest_recorded(latest)
else:
    hrvst_grouped = hrvst.groupby("project").agg({"time": "sum"})["time"]/60
    other_grouped = other.groupby("project").agg({"time": "sum"})["time"]/60
    unknown_grouped = unknown.groupby("project").agg({"time": "sum"})["time"]/60
    print("We do not write to CSV nor update the latest recorded setting when run interactively in the python shell.")