2021-04-27 13:41:44 +00:00
import pandas as pd
2021-04-28 01:45:37 +00:00
import numpy as np
2021-05-25 19:57:52 +00:00
import glob
2021-05-03 02:38:54 +00:00
import re
2021-05-03 04:04:18 +00:00
import sys
# Import our local settings management.
2021-05-02 22:39:37 +00:00
import settings
2021-04-27 13:41:44 +00:00
2021-05-28 18:00:36 +00:00
# Python only defines sys.ps1 in an interactive session, so its presence tells
# us whether we are being run from the python shell (debug) or as a script.
debug = hasattr(sys, 'ps1')
if debug:
    # Only the debugging snapshots taken further down need the copy module.
    import copy
2021-05-28 18:21:52 +00:00
# Load the pomodoro log into `timelog`. A single configured log file takes
# precedence; otherwise every CSV matching the configured path is read and
# the results are stacked into one frame.
logfile = settings.pomodoro_logfile()
if logfile:
    # This works for one file:
    timelog = pd.read_csv(logfile)
else:
    # For multiple files. The path is used as a plain string prefix of the
    # glob pattern, so it must carry its own trailing separator if it names a
    # directory.
    path = settings.pomodoro_logpath()
    all_files = glob.glob(path + "*.csv")
    if not all_files:
        # pd.concat() on an empty list fails with a bare "No objects to
        # concatenate"; report what was actually searched for instead.
        raise ValueError("No pomodoro log CSV files found matching " + path + "*.csv")
    li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]
    timelog = pd.concat(li, axis=0, ignore_index=True)
2021-05-25 19:57:52 +00:00
2021-05-27 12:19:56 +00:00
# When run interactively, keep a snapshot of the data exactly as imported.
if debug:
    imported = timelog.copy(deep=True)

# Rows that are identical in every column are collapsed to a single row.
timelog.drop_duplicates(inplace=True)

# Second snapshot, so the effect of de-duplication can be inspected.
if debug:
    nodupes = timelog.copy(deep=True)
2021-04-28 03:37:37 +00:00
# Dump bad data. The real solution here is to get rid of the 'Cancel' button
# on the Pomodoro Prompt dialog, but until then we have to drop the rows where
# the work task description is blank. pandas reads an empty CSV field back as
# NaN ("not a number"), which is why a blank description shows up that way.
# We cannot do anything with no data in the description, so such rows are
# dropped at the outset.
# A blank 'intention' is allowed, so only these three columns are checked:
required_columns = ['started', 'recorded', 'description']
# Renumber the surviving rows from zero, discarding the old index.
timelog = timelog.dropna(subset=required_columns).reset_index(drop=True)
2021-05-27 12:19:56 +00:00
# Debug-only snapshot of the frame, taken when running interactively.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the orig_* assignments below belong to the `if debug:` guard or run
# unconditionally -- confirm against the real file before reformatting.
if debug :
dropna = copy . deepcopy ( timelog )
# For debugging, keep originals around.
# These copy the description, started and recorded columns as they are at
# this point into separate orig_* columns.
timelog [ " orig_desc " ] = timelog [ " description " ]
timelog [ " orig_started " ] = timelog [ " started " ]
timelog [ " orig_recorded " ] = timelog [ " recorded " ]
2021-04-28 04:14:48 +00:00
# Clean up description before we go to work on it.
timelog["description"] = timelog["description"].str.strip()

# Allow multiple entries to be put into one prompt by splitting with semicolon.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = timelog["description"].str.split(";")
timelog = timelog.explode("description")

# A trailing or doubled semicolon (or a whitespace-only description) leaves an
# empty fragment behind. Left in, it would become a blank task with a full
# pomodoro of time attached, so strip each fragment and discard the empties.
timelog["description"] = timelog["description"].str.strip()
# reset_index() keeps the pre-explode row number in an "index" column, so
# fragments that came from the same prompt can still be traced back to it.
timelog = timelog[timelog["description"] != ""].reset_index()

if debug:
    mess = timelog.copy(deep=True)
2021-05-28 18:24:07 +00:00
# PomodoroPrompt saves its timestamps as UTC. Parsing with utc=True always
# yields a timezone-aware column -- even if a value carries no offset, or the
# offsets are mixed -- which .dt.tz_convert() needs in order to work at all.
# Unparseable 'started' values are coerced to NaT; an unparseable 'recorded'
# value still raises, exactly as before.
timelog["started"] = pd.to_datetime(timelog["started"], errors='coerce', utc=True).dt.tz_convert("US/Eastern")
timelog["recorded"] = pd.to_datetime(timelog["recorded"], utc=True).dt.tz_convert("US/Eastern")

# Only keep entries recorded after the last one already processed, if any.
latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    cutoff = pd.to_datetime(latest_recorded, utc=True)
    # .copy() gives us an independent frame, so the column assignments that
    # follow do not trigger pandas' SettingWithCopyWarning.
    timelog = timelog[timelog.recorded > cutoff].copy()
2021-05-03 04:01:59 +00:00
2021-04-27 15:06:20 +00:00
# Every logged row starts out worth 30 minutes.
timelog["time"] = 30

# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with. PomodoroPrompt saves as UTC.
started_pacific = timelog["started"].dt.tz_convert("US/Pacific")
timelog["date"] = started_pacific.dt.date

# Name of the weekday that the (Pacific) date falls on.
dates_as_timestamps = pd.to_datetime(timelog["date"])
timelog["day_of_week"] = dates_as_timestamps.dt.day_name()
2021-04-27 15:32:34 +00:00
2021-04-28 02:30:07 +00:00
# If a project has been specified (task prefixed with a project name and a
# colon), then put the project in its own column and keep only the remainder
# as the description. Only the first colon counts as the separator.
# NOTE: n must be passed by keyword; pandas 2.x no longer accepts it
# positionally in Series.str.split().
has_project = timelog['description'].str.contains(':')
pieces = timelog['description'].str.split(':', n=1)
timelog['project'] = np.where(has_project, pieces.str[0], None)
timelog['description'] = np.where(has_project, pieces.str[1], timelog['description'])

# Mid-work clean up of description and new project.
timelog['description'] = timelog['description'].str.strip()
timelog['project'] = timelog['project'].str.strip()
2021-04-28 02:40:15 +00:00
2021-04-28 02:30:38 +00:00
# If a multiplier has been provided (an asterisk and an integer at the end of a
# task), then multiply the time by it and remove it from the description.
# The pattern is anchored to the end of the string ($) so that only a trailing
# "*N" counts, and \d+ accepts multipliers of any number of digits ("*12").
multiplier_re = re.compile(r'\*\s*(\d+)$')


def _multiplier(description):
    """Return the trailing "*N" multiplier of a description, or 1 if absent."""
    found = multiplier_re.search(description)
    return int(found.group(1)) if found else 1


def _without_multiplier(description):
    """Return the description with any trailing "*N" multiplier removed."""
    return multiplier_re.sub('', description)


# These are applied row by row rather than through np.where: np.where
# evaluates both branches for every row, so chained string operations blew up
# on some systems when a branch had no string to work on.
timelog["time"] = timelog["time"] * timelog['description'].apply(_multiplier).astype(int)
# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].apply(_without_multiplier).str.strip()
2021-04-28 01:45:37 +00:00
2021-04-28 15:14:39 +00:00
# Replace irregular-but-known project names with ones timetracking tools use.
# Keys are the preferred names; values list the alternative spellings
# (including known typos) that should be mapped onto them.
replacement_project_names = {
    "Family & Home": ["Family and Home", "Family home"],
    "Find It Cambridge": ["Find It", "FIC", "Cambridge"],
    "NICHQ Support: FL CMS LAN": ["FL CMS LAN", "flcmslan"],
    "The Propaganda Site": ["TPS", "Propaganda Site", "The Propganda Site", "Murat & Clay"],
    "MASS Design Group": ["MASS"],
    "Teachers with GUTS": ["TWIG", "GUTS"],
    "NICHQ Support": ["NICHQ", "NICHQ support", "nichq"],
    "Network engagement": ["Network Engagement", "network engagement", "Network engagment", "Social media", "Network building", "Agaric network engagement"],
    "Agaric internal": ["Agaric", "Internal"],
    "Agaric contrib": ["Contributing", "Contrib", "Agaric contributions"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "Learning": ["Personal learning"],
    "Drutopia": ["Drutopia improvements"],
    "Personal / external": ["Personal/external", "Personal", "External"],
    "Near North camp": ["Near North Camp", "Near North defense", "Encampment support", "Camp support", "NN camp defense", "NN camp", "NN defense", "Near North camp defense"],
}
for preferred, alternatives in replacement_project_names.items():
    # Matching is case-insensitive: every alternative, plus the preferred name
    # itself, is compared in lower case, while the value written back keeps the
    # preferred capitalization. The set is built fresh each time so the lists
    # in replacement_project_names are left untouched.
    known_spellings = {name.lower() for name in alternatives + [preferred]}
    timelog.loc[timelog.project.str.lower().isin(known_spellings), "project"] = preferred
2021-04-28 15:14:39 +00:00
2021-04-27 23:48:46 +00:00
# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# The fillna is essential or we drop entries with blank ('None') projects,
# because groupby discards rows whose grouping key is missing.
grouping = ["date", timelog.project.fillna(""), "description"]
summary = {"time": 'sum', "started": 'min', "recorded": 'max'}
tl = timelog.groupby(grouping).agg(summary).reset_index()

if debug:
    print("We do not write to the harvest-ready.csv nor update the latest recorded setting when run interactively in the python shell.")
else:
    tl.to_csv('harvest-ready.csv', index=False)
    latest = tl.recorded.max()
    # With no rows there is no latest time (max() gives NaT); keep the
    # previously saved setting rather than overwriting it with a non-value.
    if not pd.isna(latest):
        settings.pomodoro_latest_recorded(latest)