benjamin melançon
a16e50063e
i had 'HW ' come in as a complete surprise not-converted-to-HousingWorks 'unknown' entry.
298 lines
18 KiB
Python
298 lines
18 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import glob
|
|
import re
|
|
import sys
|
|
# Import our local settings management.
|
|
import settings
|
|
|
|
# The interpreter only defines sys.ps1 in an interactive session. In that
# case we run in debug mode, which needs `copy` for the snapshots taken below.
debug = hasattr(sys, 'ps1')
if debug:
    import copy
|
|
|
|
# Read the raw log into `timelog`: from the single configured log file when
# there is one, otherwise from every CSV matching the configured log path.
if not settings.pomodoro_logfile():
    path = settings.pomodoro_logpath()
    all_files = glob.glob(path + "*.csv")
    li = [
        pd.read_csv(filename, index_col=None, header=0)
        for filename in all_files
    ]
    timelog = pd.concat(li, axis=0, ignore_index=True)
else:
    timelog = pd.read_csv(settings.pomodoro_logfile())
|
|
|
|
# Debug snapshot: the log exactly as it was read.
if debug:
    imported = copy.deepcopy(timelog)

# Remove rows that are exact duplicates of an earlier row.
timelog = timelog.drop_duplicates()

# Debug snapshot: after de-duplication.
if debug:
    nodupes = copy.deepcopy(timelog)

# Discard unusable rows. Cancelling the Pomodoro Prompt dialog leaves the task
# description blank, and a blank description is read in as NaN; nothing can be
# done with such a row. A missing 'intention' is acceptable, so only these
# three columns are required.
required_columns = ['started', 'recorded', 'description']
timelog = timelog.dropna(subset=required_columns).reset_index(drop=True)

# Debug snapshot: after dropping incomplete rows.
if debug:
    dropna = copy.deepcopy(timelog)
|
|
|
|
# Keep the untouched values next to the ones we are about to rewrite, so the
# result can be compared against the raw input when debugging.
timelog["orig_desc"] = timelog["description"]
timelog["orig_started"] = timelog["started"]
timelog["orig_recorded"] = timelog["recorded"]

# Several tasks may be typed into one prompt, separated by semicolons. Each
# fragment becomes a row of its own; reset_index() keeps the pre-split row
# number in an 'index' column.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = timelog["description"].str.strip().str.split(";")
timelog = timelog.explode("description").reset_index()

# Trim every fragment and drop the empty ones. A trailing or doubled semicolon
# ("a; b;") would otherwise become an entry with a blank description that still
# gets a full pomodoro of time.
timelog["description"] = timelog["description"].str.strip()
timelog = timelog[timelog["description"] != ""].reset_index(drop=True)

# Debug snapshot: after splitting multi-task entries.
if debug:
    mess = copy.deepcopy(timelog)
|
|
|
|
# PomodoroPrompt saves timestamps as UTC. Parsing with utc=True always yields
# a timezone-aware column, so tz_convert also works when the values carry no
# offset or carry differing offsets. Unparseable 'started' values become NaT;
# unparseable 'recorded' values still raise.
timelog["started"] = pd.to_datetime(timelog["started"], errors='coerce', utc=True).dt.tz_convert("America/New_York")
timelog["recorded"] = pd.to_datetime(timelog["recorded"], utc=True).dt.tz_convert("America/New_York")

# Only keep entries recorded after the last successful export, if one is known.
latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the unfiltered one.
    timelog = timelog[timelog.recorded > pd.to_datetime(latest_recorded)].copy()

# Every log row starts out as 30 minutes.
timelog["time"] = 30

# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with.
timelog["date"] = pd.to_datetime(timelog["started"].dt.tz_convert("America/Los_Angeles").dt.date)
timelog["day_of_week"] = timelog["date"].dt.day_name()
|
|
|
|
# When a description contains ': ', the text before the first ': ' is the
# project and the remainder is the task description. Rows without it get no
# project and keep their description as is.
has_project = timelog['description'].str.contains(': ')
pieces = timelog['description'].str.split(': ', n=1)
timelog['project'] = np.where(has_project, pieces.str[0], None)
timelog['description'] = np.where(has_project, pieces.str[1], timelog['description'])

# Tidy both columns now that they have been separated.
timelog['project'] = timelog['project'].str.strip()
timelog['description'] = timelog['description'].str.strip()
|
|
|
|
# A description may open with a parenthesised phrase that back-dates the entry,
# for example "(yesterday) fixed the build" or "(-2) wrote report".
# In an ideal world we would use https://github.com/bear/parsedatetime or similar and
# even better figure out the right date for strings like "Monday" but eh this'll do.
timeshift_days = {
    -1: ['one day ago', '1 day ago', 'yesterday'],
    -2: ['two days ago', '2 days ago', 'day before yesterday', 'the day before yesterday'],
    -3: ['three days ago', '3 days ago'],
    -4: ['four days ago', '4 days ago'],
    -5: ['five days ago', '5 days ago'],
}
# Flatten to phrase -> day offset; the bare offset itself ("-2") is accepted
# too. Built separately so the table above is not modified.
timeshift_lookup = {
    phrase: days
    for days, phrases in timeshift_days.items()
    for phrase in (*phrases, str(days))
}

# Text inside a leading pair of parentheses, without the parentheses and
# without surrounding whitespace; NaN when the description has none.
timelog['tmp_timeshift'] = timelog['description'].str.extract(r'^\((.+?)\)', expand=False).str.strip()

# Day offset for recognised phrases (matched case-insensitively), 0 otherwise.
timelog['tmp_daysdelta'] = timelog['tmp_timeshift'].str.lower().map(timeshift_lookup).astype(float).fillna(0)
timelog['date'] = timelog['date'] + pd.to_timedelta(timelog['tmp_daysdelta'], unit='D')
timelog['tmp_timeshift'] = timelog['tmp_timeshift'].fillna("")

# Remove the phrase from the description, but only where it was recognised and
# only at the start. A leading parenthetical that is not a time shift, such as
# "(WIP) refactor", is part of the description and stays.
shifted = timelog['tmp_daysdelta'] != 0
timelog.loc[shifted, 'description'] = timelog.loc[shifted, 'description'].str.replace(r'^\(.+?\)', '', n=1, regex=True)
|
|
|
|
# Trim first, so that a multiplier at the end really is at the end of the
# string when the pattern below is applied.
timelog['description'] = timelog['description'].str.strip()

# A task may end with an asterisk and an integer ("pairing * 3"): the time is
# multiplied by that number and the suffix is removed from the description.
# The pattern is anchored to the end of the string and the split is made from
# the right, so both act on the same asterisk.
p = re.compile(r'\*\s*\d+$')


def _split_multiplier(text):
    """Return (description, multiplier) for a single task description."""
    if not p.search(text):
        return text, 1
    head, factor = text.rsplit('*', 1)
    return head, factor.strip()


# Done row by row rather than with np.where: np.where evaluates both branches
# for every row, including rows that have no multiplier to split off.
split_rows = timelog['description'].map(_split_multiplier)
timelog['description'] = split_rows.str[0]
timelog["time"] = timelog["time"] * split_rows.str[1].astype(int)

# Trim once more now that the suffix is gone.
timelog['description'] = timelog['description'].str.strip()
|
|
|
|
# Shorthand project names (the lists on the right) expand to a full
# "project — task" combination (the key on the left).
compound_project_tasks = {
    "Drutopia — Contributing back to the community": ["Drutopia contrib", "Drutopia contributing", "Drutopia contributions"],
    "Find It Cambridge — Contributing back to the community": ["Find It Contrib"],
    "Find It Cambridge — Planning": ["Find It project management"],
    "Internal — Contributing back to the community": ["Contrib", "Agaric: contrib", "Contributing", "Agaric contrib", "Agaric contributions"],
    "Internal — Conferences & Meetups": ["Conference", "Cons", "Camps", "Conferences", "meetup", "meetups"],  # Note camp is not here because that maps to Near North camp
    "Internal — Content": ["Agaric site content", "Agaric content", "content", "blog", "blogging", "writing", "content writing"],
    "Internal — Documentation": ["documentation", "docs", "documenting"],
    "Internal — Marketing": ["marketing", "Agaric marketing", "promotion", "Agaric promotion"],
    "Internal — Other": ["other"],
    "Internal — Overhead": ["overhead"],
    "Internal — Planning": ["planning", "plan"],
    "Internal — Personal Learning": ["Learning", "Personal learning"],
    "Internal — Presentations": ["presentations", "presentation"],
    "Internal — Network Engagement": ["Network Engagement", "NE", "network engagment", "Social media", "Network building", "Agaric network engagement", "AgaricNetwork Engagement", "Networking"],
    "VHFA — Contributing back to the community": ["VHFA contrib"],
}

# Build one lower-cased lookup (alternative -> preferred spelling) and apply it
# in a single pass. Matching ignores case, the preferred spelling keeps its
# capitalisation, and the preferred name matches itself. The table above is
# left unmodified; if a name is listed twice the first entry wins.
compound_lookup = {}
for preferred, alternatives in compound_project_tasks.items():
    for name in (*alternatives, preferred):
        compound_lookup.setdefault(name.lower(), preferred)

canonical_compound = timelog['project'].str.lower().map(compound_lookup)
known_compound = canonical_compound.notna()
timelog.loc[known_compound, "project"] = canonical_compound[known_compound]
|
|
|
|
# A compound project ("Project — Task") is separated into the project and a
# sub-project (in Harvest this is the Task, which is really a task type).
is_compound = timelog['project'].str.contains(' — ')
project_parts = timelog['project'].str.split(' — ', n=1)
timelog['subproject'] = np.where(is_compound, project_parts.str[1], None)
timelog['project'] = np.where(is_compound, project_parts.str[0], timelog['project'])

# Remove surrounding whitespace from the final project and sub-project/task.
timelog['project'] = timelog['project'].str.strip()
timelog['subproject'] = timelog['subproject'].str.strip()
|
|
|
|
# Replace irregular-but-known project names with ones timetracking tools use.
# Key: the name to output. Value: alternative spellings accepted as input.
harvest_project_names = {
    "Everyday AI Project": ["EverydayAI", "MIT Scheller Teacher Education Program (STEP) Lab", "Education Arcade", "Everyday AI", "Everday AI", "EA"],
    "Boston Modern Orchestra Project": ["BMOP", "BMOP.org"],
    "crla.org - Development & Support": ["CRLA.org upgrade", "CRLA", "CRLA upgrade", "California Rural Legal Assistance", "crla.org"],
    "Cockrill Precision Products": ["Cockrill Corp", "Cockrill"],
    "Contratados.org": ["Contratados", "RADCAT", "Research Action Design LLC"],
    "Cultura Continued Support": ["Cultura", "MIT Cultura"],
    "Drutopia": ["Drutopia improvements", "Drutopia overhead", "Drutapia"],
    "EC Connect": ["eccconectcolorado.org", "Denver Econnect", "Denver Early Childhood", "ECconnect", "ECconnectColorado"],
    "Eliot School Site & CRM": ["Eliot", "Eliot School"],
    "encuentro 5 sites": ["Encuentro5", "e5", "Encuentro"],
    "endTB website support": ["PIH", "Partners In Health", "endTB", "endtb.org", "endtb support"],
    "ExperienceOlympic.com": ["Experience Olympic", "ExperienceOlympic", "Experience Olympia", "EO", "olympic", "ExpOlympic", "expeirenec olympic", "Experience Olympic LLC"],
    "Family & Home": ["Family and Home", "Family home"],
    "Find It Cambridge": ["Find It", "FIC", "Cambridge", "FindIt", "FindIt Cambridge"],
    "GBCLT.org": ["Gertrude Brown Community Land Trust", "GBCLT", "GB", "Gertrude", "Gertrude Brown", "CLT", "Land Trust", "Community Land Trust", "GB CLT"],
    "GEO Support": ["GEO", "GEO.coop", "Grassroots Economic Organizing"],
    "Green Calendar": ["Action Information, Inc.", "Action Information", "GreenCalendar", "Action Info", "GC", "green cal", "greencal", "actioninfo", "actinfo", "Action Informaton", "AI"],
    "HousingWorks": ["HousingWorks, Inc", "HousingWorks.net", "HW", "Housing Works", "hwerkz"],
    "hoptothebeat and beantowncamp": ["Hop To The Beat", "hoptothebeat", "beanttowncamp", "hopto", "beantown", "HTTB"],
    "Immigrant Navigator": ["IFSI", "Immigrant Family Services"],
    "Internal": ["Agaric", "Agaric internal"],
    "Jacket2 Drupal Website upgrade": ["University of Pennsylvania", "UPenn", "Jacket2", "jacket"],
    "Kalamuna subcontracting": ["Kalamuna Inc", "Kalamuna"],
    "Kick Big Polluters Out Website": ["Kick Big Polluters Out", "KBPO", "KOBP", "Rosa Luxemburg Stiftung", "Rosa Luxemburg"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "lwcjustice.org": ["Longshore Workers Coalition", "LWC", "Longshore Workers", "lwcjustice", "lwc justice", "Longshore", "longshoreworkers"],
    "MASS Continuous Improvement": ["MASS Design Group", "MASS", "MASS Design"],
    "monasteriesoftheheart.org support": ["Monasteries of the Heart", "MOH", "monastery"],
    "Metalwerx Maintenance": ["Metalwerx", "Metalwerx.com"],
    "NICHQ Data Upgrade": ["NICHQ Data"],
    "NICHQ Support": ["NICHQ", "NICHQ maintenance", "NICHQ Community"],
    "NICHQ FL CMS LAN": ["FL CMS LAN", "flcmslan", "NICHQ FLCMSLAN"],
    "North Carolina Housing Finance Agency": ["NCHFA", "NC HFA", "North Carolina", "NC Housing", "North Carolina Housing", "North Carolina HFA", "NC Housing Finance Agency"],
    "OAG - Office of Opportunity and Achievement Gaps Task Force": ["Boston Public Schools", "BPS", "OAG"],
    "Patient HM Brain Science Website": ["Patient HM", "patientHM"],
    "PDX Contemporary Art Upgrade to D10": ["PDX Contemporary Art", "PDX Art", "Portland Art", "PDX Art Museum"],
    "PECE migration to Drupal 10": ["PECE", "PECE migration", "PECE D10", "Platform for Experimental, Collaborative Ethnography", "University of California, Irvine"],
    "Portside": ["Portside.org Improvements 2020", "portside.org", "Portside support"],
    "SCDTDP Collaboratory Data Site System Security": ["SCDTDP", "NICHQ SCDTDP", "NICHQ security"],
    "Making Sense of Models (Teachers with Guts)": ["Teachers with GUTS", "TWIG", "GUTS", "Project GUTS", "Making Sense of Models", "Project GUTS/TWIG/Making Sense of Models"],
    "SaharaReporters.com": ["Sahara Reporters", "Sahara", "SaharaReporters", "Sahara Network", "SR", "Sahara Reporter"],
    "The Propaganda Site": ["TPS", "Propaganda Site", "The Propganda Site", "Murat & Clay"],
    "Therapy Fidelity App - Development": ["Tulane", "Therapy Fidelity App"],
    "Type Network": ["TypeNetwork", "TN", "Tyye Network", "Type Netork"],
    "VHFA": ["Vermont Housing Finance Agency", "Vermont", "Vermont Housing"],
    "Vulk redesign": ["Vulk", "Vulk.coop"],
    "Virtual safe space (VSS) and eReferral Pathways (eRPW)": ["UNICEF", "VSS", "eRPW", "VSS+eRPW", "VSS + eRPW", "VSS and ERPW", "unicef code review", "virtual safe spaces", "ereferral", "laaha", "laaha.org"],
    "Visions Unite": ["VU", "VisionsUnite"],
    "Winchester Find It demo site": ["Winchester", "Town Common", "Winchester Town Common", "Find It Winchester", "findit Winchester", "towncommon"],
    "C-Team support": ["ZEIT", "ZEIT ONLINE", "ZEIT Upgrade", "Zeit D9"],
}
# Projects that are known but are not exported to Harvest.
other_project_names = {
    "Near North camp": ["Near North Camp", "Near North defense", "Encampment support", "Camp support", "NN camp defense", "NN camp", "NN defense", "Near North camp defense", "Camp", "Near North", "Encampment Defense"],
    "Personal": ["Personal/external", "Personal / external", "External"],
    "Tzedakah": ["Community support"],
    "PWGD": ["People Who Give a Damn", "PWGD Inc"],
    "Workers Defense Alliance": ["WDA", "Alliance", "Twin Cities Workers Defense Alliance"],
    "Solidarity Network": ["SolNet"],
}

replacement_project_names = harvest_project_names | other_project_names

# Build one lower-cased lookup (alternative -> preferred spelling) and apply it
# in a single pass. Matching ignores case, the preferred spelling keeps its
# capitalisation, and the preferred name matches itself. The tables above are
# left unmodified; if a name is listed twice the first entry wins.
project_lookup = {}
for preferred, alternatives in replacement_project_names.items():
    for name in (*alternatives, preferred):
        project_lookup.setdefault(name.lower(), preferred)

canonical_project = timelog['project'].str.lower().map(project_lookup)
known_project = canonical_project.notna()
timelog.loc[known_project, "project"] = canonical_project[known_project]
|
|
|
|
# Replace irregular-but-known subproject ("Task") names with ones timetracking tools use.
# Development is the default and never specified.
subproject_names = {
    "Contributing back to the community": ["contrib", "contributing", "contributions"],
    "Not billed": ["nb"],
    "Planning": ["plan", "meeting", "pm", "project management", "plannng"],
    "Quality Assurance": ["qa", "quality"],
}

# Build one lower-cased lookup (alternative -> preferred spelling) and apply it
# in a single pass. Matching ignores case and the preferred name matches
# itself. The table above is left unmodified.
subproject_lookup = {}
for preferred, alternatives in subproject_names.items():
    for name in (*alternatives, preferred):
        subproject_lookup.setdefault(name.lower(), preferred)

canonical_subproject = timelog['subproject'].str.lower().map(subproject_lookup)
known_subproject = canonical_subproject.notna()
timelog.loc[known_subproject, "subproject"] = canonical_subproject[known_subproject]
# TODO flag when task/subproject names are not known, because otherwise they
# get force-created on import.

# Default task for Leads project should be 'Leads' (not Development as will be filled in below).
timelog.loc[(timelog.project == "Leads") & timelog.subproject.isna(), "subproject"] = "Leads"
# The above could also have been done in the compound_project_tasks
# Leaving this here as an example.
|
|
|
|
# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# The fillna is essential or we drop entries with blank ('None') projects;
# a missing sub-project becomes the default task, "Development".
tl = (
    timelog
    .groupby(["date", timelog.project.fillna(""), timelog.subproject.fillna("Development"), "description"])
    .agg({"time": 'sum', "started": 'min', "recorded": 'max'})
    .reset_index()
)

# With nothing to export, the maxima below would be NaT and formatting the
# date would fail with an unhelpful error. Stop here with a clear message,
# before any file or the latest-recorded setting is written.
if tl.empty:
    raise ValueError("No time entries left to export after filtering and grouping.")

# We're doing the final conversion to Harvest as a separate step because we
# want to factor out all of the above non-Harvest-specific logic.

# Most recent 'recorded' timestamp, and the latest entry date as YYYY-MM-DD.
latest = tl.recorded.max()
datest = tl.date.max().strftime('%Y-%m-%d')

# Separate Harvest from non-Harvest projects, and also filter out any blank
# projects, but save those too for a CSV of the excluded items.
hrvst = tl[tl.project.isin(harvest_project_names.keys())]
other = tl[tl.project.isin(other_project_names.keys())]
unknown = tl[~tl.project.isin(replacement_project_names.keys())]
|
|
|
|
# Shape the Harvest rows for export: rename the columns, convert minutes to
# hours, add the person's name, look up each project's client in the map from
# settings, and drop the columns that are not exported.
project_client_mapping = settings.harvest_get_projects_clients_map()
harvest_columns = {'date': 'Date', 'project': 'Project', 'subproject': 'Task', 'description': 'Notes'}
harvest = (
    hrvst
    .rename(columns=harvest_columns)
    .assign(**{
        "Hours": lambda frame: frame["time"] / 60,
        "First name": "Benjamin",
        "Last name": "Melançon",
        "Client": lambda frame: frame["Project"].map(project_client_mapping),
    })
    .drop(columns=['started', 'recorded', 'time'])
)
|
|
|
|
if debug:
    # Interactive run: only total the hours per project for inspection.
    hrvst_grouped = hrvst.groupby("project")["time"].sum() / 60
    other_grouped = other.groupby("project")["time"].sum() / 60
    unknown_grouped = unknown.groupby("project")["time"].sum() / 60
    print("We do not write to CSV nor update the latest recorded setting when run interactively in the python shell.")
else:
    # Normal run: write the three CSVs, named after the latest entry date, and
    # store the most recent 'recorded' timestamp in settings.
    harvest.to_csv(f'harvest-timesheets-{datest}.csv', index=False)
    other.to_csv(f'personal-other-{datest}.csv', index=False)
    unknown.to_csv(f'unknown-{datest}.csv', index=False)
    settings.pomodoro_latest_recorded(latest)
|