2021-04-27 13:41:44 +00:00
import pandas as pd
2021-04-28 01:45:37 +00:00
import numpy as np
2021-05-25 19:57:52 +00:00
import glob
2021-05-03 02:38:54 +00:00
import re
2021-05-03 04:04:18 +00:00
import sys
# Import our local settings management.
2021-05-02 22:39:37 +00:00
import settings
2021-04-27 13:41:44 +00:00
2021-05-28 18:00:36 +00:00
# An interactive interpreter defines sys.ps1. When it is present we run in
# debug mode, which needs the copy module for the intermediate snapshots.
debug = hasattr(sys, 'ps1')
if debug:
    import copy
2021-05-28 18:21:52 +00:00
# Load the raw pomodoro log: the single file named in settings if there is
# one, otherwise every CSV matching the configured log path prefix.
logfile = settings.pomodoro_logfile()
if logfile:
    # This works for one file:
    timelog = pd.read_csv(logfile)
else:
    # For multiple files. Sorted so the row order does not depend on the
    # arbitrary order in which glob returns matches.
    path = settings.pomodoro_logpath()
    all_files = sorted(glob.glob(path + "*.csv"))
    if not all_files:
        # pd.concat([]) would otherwise fail with the unhelpful
        # "No objects to concatenate".
        raise FileNotFoundError(f"No pomodoro log CSV files match {path}*.csv")
    li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]
    timelog = pd.concat(li, axis=0, ignore_index=True)
2021-05-25 19:57:52 +00:00
2021-05-27 12:19:56 +00:00
# Debug only: keep the frame exactly as it was imported.
if debug:
    imported = timelog.copy(deep=True)

# Rows that are identical in every column are collapsed into one.
timelog = timelog.drop_duplicates()

# Debug only: keep the de-duplicated frame.
if debug:
    nodupes = timelog.copy(deep=True)
2021-04-28 03:37:37 +00:00
# Dump bad data. The real solution is to get rid of the 'Cancel' button on the
# Pomodoro Prompt dialog, but until then rows with a blank work task
# description arrive as NaN (possibly because they are the last row of the
# spreadsheet) and there is nothing we can do with them, so they are dropped
# at the outset.
# A missing 'intention' is allowed, so only these three columns are required.
required_columns = ['started', 'recorded', 'description']
timelog = timelog.dropna(subset=required_columns).reset_index(drop=True)
2021-05-27 12:19:56 +00:00
# Debug-only snapshot of the frame as it stands after the dropna/reset_index
# step, followed by copies of the three raw columns before they are rewritten.
if debug :
dropna = copy . deepcopy ( timelog )
# For debugging, keep originals around.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the three orig_* assignments below run only under `if debug` or
# unconditionally — confirm against the real file. No later line in this
# script reads the orig_* columns.
timelog [ " orig_desc " ] = timelog [ " description " ]
timelog [ " orig_started " ] = timelog [ " started " ]
timelog [ " orig_recorded " ] = timelog [ " recorded " ]
2021-04-28 04:14:48 +00:00
# Clean up description before we go to work on it.
trimmed = timelog['description'].str.strip()

# Allow multiple entries to be put into one prompt by splitting with semicolon.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = trimmed.str.split(";")
# One row per task. reset_index() without drop=True keeps the pre-explode row
# label in an "index" column.
timelog = timelog.explode("description").reset_index()

# Debug only: keep the exploded frame.
if debug:
    mess = copy.deepcopy(timelog)
2021-05-28 18:24:07 +00:00
# Parse both timestamps and express them in US/Eastern. utc=True yields a
# timezone-aware column even when the input has no offset or mixes offsets,
# which .dt.tz_convert requires. An unparseable start time becomes NaT
# (errors='coerce'); an unparseable recorded time still raises.
timelog["started"] = pd.to_datetime(
    timelog["started"], errors='coerce', utc=True
).dt.tz_convert("US/Eastern")
timelog["recorded"] = pd.to_datetime(
    timelog["recorded"], utc=True
).dt.tz_convert("US/Eastern")

# Keep only entries recorded after the last run, when settings has a value.
latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    # .copy() makes the filtered frame independent of the original, so the
    # column assignments that follow do not raise SettingWithCopyWarning.
    timelog = timelog[timelog.recorded > pd.to_datetime(latest_recorded)].copy()

# Every entry starts at 30 minutes.
timelog["time"] = 30
# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with. PomodoroPrompt saves as UTC.
pacific_started = timelog["started"].dt.tz_convert("US/Pacific")
timelog['date'] = pd.to_datetime(pacific_started.dt.date)
timelog["day_of_week"] = timelog['date'].dt.day_name()
2021-04-27 15:32:34 +00:00
2021-04-28 02:30:07 +00:00
# If a project has been specified (the task is prefixed with a project name
# and a colon), then put the project in its own column.
# n=1 is passed by keyword: since pandas 2.0 every str.split argument after
# `pat` is keyword-only, so the positional form raises TypeError.
has_project = timelog['description'].str.contains(':')
project_and_task = timelog['description'].str.split(':', n=1)
timelog['project'] = np.where(has_project, project_and_task.str[0], None)
timelog['description'] = np.where(
    has_project, project_and_task.str[1], timelog['description']
)
# Mid-work clean up of description and new project.
timelog['description'] = timelog['description'].str.strip()
timelog['project'] = timelog['project'].str.strip()
2021-04-28 02:40:15 +00:00
2022-03-04 23:33:43 +00:00
# A description may begin with a parenthesised time shift such as
# "(yesterday)" or "(2 days ago)", which moves the entry to an earlier date.
# In an ideal world we would use https://github.com/bear/parsedatetime or similar and
# even better figure out the right date for strings like "Monday" but eh this'll do.
timeshift_days = {
    -1: ['one day ago', '1 day ago', 'yesterday'],
    -2: ['two days ago', '2 days ago', 'day before yesterday', 'the day before yesterday'],
    -3: ['three days ago', '3 days ago'],
    -4: ['four days ago', '4 days ago'],
    -5: ['five days ago', '5 days ago'],
}
# Flatten to phrase -> days without touching the lists above. The bare number
# (for example "-1") is accepted as well.
timeshift_lookup = {
    phrase: days
    for days, phrases in timeshift_days.items()
    for phrase in [*phrases, str(days)]
}

# The leading "(...)" exactly as typed, or "" when the description has none.
leading_paren = timelog['description'].str.extract(r'^(\(.+?\))', expand=False)
timelog['tmp_timeshift'] = leading_paren.fillna("")

# Compare case-insensitively and ignore padding inside the parentheses.
shift_phrase = leading_paren.str[1:-1].str.strip().str.lower()
timelog['tmp_daysdelta'] = shift_phrase.map(timeshift_lookup).fillna(0).astype(float)
timelog['date'] = timelog['date'] + pd.to_timedelta(timelog['tmp_daysdelta'], unit='D')

# Remove the marker only where it was understood as a time shift. Any other
# leading parenthetical, such as "(draft)", is part of the note and stays.
recognized = timelog['tmp_daysdelta'] != 0
without_marker = timelog['description'].str.replace(r'^\(.+?\)', '', n=1, regex=True)
timelog['description'] = timelog['description'].where(~recognized, without_marker)
2021-12-12 17:19:07 +00:00
2021-04-28 02:30:38 +00:00
# If a multiplier has been provided (an asterisk and an integer at the end of a
# task), then multiply the time by it and remove it from the description.
# Ensure we're splitting on the same asterisk we found: Use the end of string
# signifier in the regular expression ($), and split from the right.
# \d+ rather than a single \d, so that multi-digit multipliers such as "*12"
# are recognised. The code that uses this pattern splits on the last asterisk
# and converts the remainder with int, which handles any number of digits.
p = re.compile(r'\*\s*\d+$')
# An earlier np.where version of this worked on some systems and failed on
# others: np.where evaluates both branches for every row, so the chained string
# methods also ran on rows without a multiplier and broke there. Applying a
# plain function to each row only touches the rows that match.
def _multiplier_text(text):
    """Return the multiplier typed after the last asterisk, or 1 if none."""
    return text.rsplit('*', 1)[1].strip() if p.search(text) else 1


def _without_multiplier(text):
    """Return the description with a trailing multiplier cut off."""
    return text.rsplit('*', 1)[0] if p.search(text) else text


multipliers = timelog['description'].apply(_multiplier_text).astype(int)
timelog['description'] = timelog['description'].apply(_without_multiplier)
timelog["time"] = timelog["time"] * multipliers
# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].str.strip()
2021-04-28 01:45:37 +00:00
2021-12-06 12:59:02 +00:00
# Specific tasks are expanded from items in list on right into project-task combo on left.
# Each key must appear only once: a repeated key in a dict literal silently
# replaces the earlier list. "Internal — Content" used to be listed twice, so
# its two alias lists are merged here.
compound_project_tasks = {
    "Drutopia — Contributing back to the community": ["Drutopia contrib", "Drutopia contributing", "Drutopia contributions"],
    "Find It Cambridge — Contributing back to the community": ["Find It Contrib"],
    "Find It Cambridge — Planning": ["Find It project management"],
    "Internal — Contributing back to the community": ["Contrib", "Agaric: contrib", "Contributing", "Agaric contrib", "Agaric contributions"],
    "Internal — Conferences & Meetups": ["Conference", "Camps", "Camp", "Conferences", "meetup", "meetups"],
    "Internal — Content": ["content", "blog", "blogging", "writing", "Agaric site content", "Agaric content"],
    "Internal — Documentation": ["documentation", "docs", "documenting"],
    "Internal — Other": ["other"],
    "Internal — Overhead": ["overhead"],
    "Internal — Personal Learning": ["Learning", "Personal learning"],
    "Internal — Presentations": ["presentations", "presentation"],
    "Internal — Network Engagement": ["Network Engagement", "network engagement", "Network engagment", "Social media", "Network building", "Agaric network engagement", "AgaricNetwork Engagement"],
    "VHFA — Contributing back to the community": ["VHFA contrib"],
}
for preferred, alternatives in compound_project_tasks.items():
    # Matching is case-insensitive and also accepts the preferred name itself,
    # but what we write back keeps the preferred capitalization. The lowercase
    # set is built separately so the lookup table's lists are not modified.
    known = {item.lower() for item in [*alternatives, preferred]}
    timelog.loc[timelog['project'].str.lower().isin(known), "project"] = preferred

# If a compound project was specified, break that out into a sub-project (in
# Harvest, we use Task, which is really task type, for this).
# na=False keeps rows without a project out of the split. n=1 is passed by
# keyword because pandas 2.0 made it keyword-only.
is_compound = timelog['project'].str.contains('—', regex=False, na=False)
project_parts = timelog['project'].str.split('—', n=1)
timelog['subproject'] = np.where(is_compound, project_parts.str[1], None)
timelog['project'] = np.where(is_compound, project_parts.str[0], timelog['project'])
# Drop the spaces that surrounded the dash so both halves match the lookup
# tables exactly.
timelog['subproject'] = timelog['subproject'].str.strip()
timelog['project'] = timelog['project'].str.strip()
2021-06-18 20:28:13 +00:00
2021-04-28 15:14:39 +00:00
# Replace irregular-but-known project names with ones timetracking tools use.
# Key: the name to write out. Value: other spellings accepted for it.
harvest_project_names = {
    "Boston Modern Orchestra Project": ["BMOP", "BMOP.org"],
    "CRLA.org upgrade": ["CRLA", "CRLA upgrade"],
    "Cockrill Precision Products": ["Cockrill Corp", "Cockrill"],
    "Cultura Continued Support": ["Cultura", "MIT Cultura"],
    "Drutopia": ["Drutopia improvements", "Drutopia overhead"],
    "EC Connect": [
        "eccconectcolorado.org",
        "Denver Econnect",
        "Denver Early Childhood",
        "ECconnect",
        "ECconnectColorado",
    ],
    "Eliot School Site & CRM": ["Eliot", "Eliot School"],
    "encuentro 5 sites": ["Encuentro5", "e5", "Encuentro"],
    "Family & Home": ["Family and Home", "Family home"],
    "Find It Cambridge": ["Find It", "FIC", "Cambridge", "FindIt", "FindIt Cambridge"],
    "GEO Support": ["GEO", "GEO.coop", "Grassroots Economic Organizing"],
    "Green Calendar": ["Action Information, Inc.", "Action Information", "GreenCalendar"],
    "Immigrant Navigator": ["IFSI", "Immigrant Family Services"],
    "Internal": ["Agaric", "Agaric internal"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "MASS Continuous Improvement": ["MASS Design Group", "MASS", "MASS Design"],
    "Metalwerx Maintenance": ["Metalwerx", "Metalwerx.com"],
    "NICHQ Data Upgrade": ["NICHQ Data"],
    "NICHQ Support": ["NICHQ", "NICHQ maintenance", "NICHQ Community"],
    "NICHQ FL CMS LAN": ["FL CMS LAN", "flcmslan", "NICHQ FLCMSLAN"],
    "OAG - Office of Opportunity and Achievement Gaps Task Force": [
        "Boston Public Schools",
        "BPS",
        "OAG",
    ],
    "Patient HM Brain Science Website": ["Patient HM", "patientHM"],
    "Portside": ["Portside.org Improvements 2020", "portside.org", "Portside support"],
    "SCDTDP Collaboratory Data Site System Security": [
        "SCDTDP",
        "NICHQ SCDTDP",
        "NICHQ security",
    ],
    "Project GUTS/TWIG/Making Sense of Models": [
        "Teachers with GUTS",
        "TWIG",
        "GUTS",
        "Project GUTS",
    ],
    "The Propaganda Site": ["TPS", "Propaganda Site", "The Propganda Site", "Murat & Clay"],
    "Therapy Fidelity App - Development": ["Tulane", "Therapy Fidelity App"],
    "Type Network": ["TypeNetwork", "TN", "Tyye Network"],
    "VHFA": ["Vermont Housing Finance Agency", "Vermont", "Vermont Housing"],
    "Vulk redesign": ["Vulk", "Vulk.coop"],
    "Visions Unite": ["VU", "VisionsUnite"],
    "C-Team support": ["ZEIT", "ZEIT ONLINE", "ZEIT Upgrade", "Zeit D9"],
}
# Projects kept in a separate table from the Harvest ones above.
other_project_names = {
    "Near North camp": [
        "Near North Camp",
        "Near North defense",
        "Encampment support",
        "Camp support",
        "NN camp defense",
        "NN camp",
        "NN defense",
        "Near North camp defense",
        "Camp",
        "Near North",
    ],
    "Personal": ["Personal/external", "Personal / external", "External"],
    "Tzedakah": ["Community support"],
    "PWGD": ["People Who Give a Damn", "PWGD Inc"],
    "Workers Defense Alliance": ["WDA", "Alliance", "Twin Cities Workers Defense Alliance"],
    "Solidarity Network": ["SolNet"],
}
# One combined lookup over both tables. The alias lists are shared with the
# two tables above, not copied.
replacement_project_names = {**harvest_project_names, **other_project_names}
2021-06-02 00:33:23 +00:00
2021-04-28 15:14:39 +00:00
for canonical, aliases in replacement_project_names.items():
    # The canonical name joins its own alias list so that it is matched too.
    # Matching is done on lowercase text, but the value written back keeps the
    # canonical capitalization.
    aliases.append(canonical)
    lowered = list(map(str.lower, aliases))
    is_alias = timelog["project"].str.lower().isin(lowered)
    timelog.loc[is_alias, "project"] = canonical
2021-04-28 15:14:39 +00:00
2021-06-18 20:28:13 +00:00
# Replace irregular-but-known subproject ("Task") names with ones timetracking tools use.
# Development is the default and never specified.
subproject_names = {
    "Contributing back to the community": ["contrib", "contributing", "contributions"],
    "Not billed": ["nb"],
    "Planning": ["plan", "meeting", "pm", "project management"],
    "Quality Assurance": ["qa", "quality"],
}
for canonical, aliases in subproject_names.items():
    # Same scheme as for projects: match on lowercase text, write back the
    # canonical capitalization.
    aliases.append(canonical)
    lowered = list(map(str.lower, aliases))
    is_alias = timelog["subproject"].str.lower().isin(lowered)
    timelog.loc[is_alias, "subproject"] = canonical
2021-06-01 20:43:57 +00:00
2021-04-27 23:48:46 +00:00
# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# The fillna is essential or we drop entries with blank ('None') projects.
grouping = [
    "date",
    timelog["project"].fillna(""),
    timelog["subproject"].fillna("Development"),
    "description",
]
tl = (
    timelog.groupby(grouping)
    .agg(time=("time", "sum"), started=("started", "min"), recorded=("recorded", "max"))
    .reset_index()
)

# We're doing the final conversion to Harvest as a separate step because we
# want to factor out all of the above non-Harvest-specific logic.
latest = tl["recorded"].max()
newest_date = tl["date"].max()
datest = newest_date.strftime('%Y-%m-%d')
2021-06-01 15:23:45 +00:00
2021-06-02 09:58:42 +00:00
# Separate Harvest from non-Harvest projects. Anything whose project is in
# neither table, blank projects included, goes to `unknown` so it can be saved
# as a CSV of the excluded items.
hrvst = tl[tl["project"].isin(list(harvest_project_names))]
other = tl[tl["project"].isin(list(other_project_names))]
unknown = tl[~tl["project"].isin(list(replacement_project_names))]

project_client_mapping = settings.harvest_get_projects_clients_map()

# Build the Harvest frame: rename the columns, turn minutes into hours and add
# the person and client columns.
harvest = hrvst.rename(
    columns={
        'date': 'Date',
        'project': 'Project',
        'subproject': 'Task',
        'description': 'Notes',
    }
)
harvest = harvest.assign(
    **{
        "Hours": harvest["time"] / 60,
        "First name": "Benjamin",
        "Last name": "Melançon",
    }
)
harvest["Client"] = harvest["Project"].map(project_client_mapping)
harvest = harvest.drop(columns=['started', 'recorded', 'time'])
2021-06-02 00:30:30 +00:00
2021-05-29 08:34:08 +00:00
if debug:
    # Interactive run: leave per-project hour totals around for inspection and
    # write nothing.
    hrvst_grouped = hrvst.groupby("project")["time"].sum() / 60
    other_grouped = other.groupby("project")["time"].sum() / 60
    unknown_grouped = unknown.groupby("project")["time"].sum() / 60
    print("We do not write to CSV nor update the latest recorded setting when run interactively in the python shell.")
else:
    # Normal run: write the three CSVs, then hand the newest recorded time to
    # settings.
    harvest.to_csv(f'harvest-timesheets-{datest}.csv', index=False)
    other.to_csv(f'personal-other-{datest}.csv', index=False)
    unknown.to_csv(f'unknown-{datest}.csv', index=False)
    settings.pomodoro_latest_recorded(latest)