BASIC PROJECT ANALYSIS#

import pandas as pd

## to avoid warnings of deprecated functions merging multiple levels
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

df = pd.read_csv("../outputs/citations.tsv", \
                 sep="\t", header=None)
df.columns = ['nsfid', 'doi', 'cites']

df.doi = df.doi.str.lower()
df_clean = df.drop_duplicates()
df_clean = df_clean.query('cites>-1')

PUBLICATION COUNTS BY PROJECT#

df_clean

	nsfid	doi	cites
0	1324760	10.1016/j.geomorph.2015.03.039	19
2	1324760	10.2110/sedred.2013.4.9	2
3	1324760	10.2110/sedred.2013.4	0
4	1324760	10.1029/2021ef002088	0
5	1324760	10.1029/2017jf004576	10
...	...	...	...
356	1639694	10.1016/j.dib.2022.107824	2
357	1639694	10.1016/j.cageo.2019.07.003	2
358	1440221	10.1007/978-3-319-33245-1	2
359	1541028	10.3897/biss.2.26644	0
360	1541028	10.1130/abs/2017am-298208	0

309 rows × 3 columns

df_nsf_normed = pd.read_csv("../outputs/nsf/nsfid_project_title_normed.csv")
df_nsf_normed.columns = ['nsfid', 'title_normed']

df_project_counts = \
    df_clean \
    .groupby('nsfid') \
    .agg(doi_count=('doi','count'),  \
         total_cites=('cites','sum')) \
    .reset_index() \
    .merge(df_nsf_normed, how='left', on='nsfid')

df_project_counts.sort_values('doi_count', ascending=False)[:10]

	nsfid	doi_count	total_cites	title_normed
24	1440323	16	262	Earthcube Building Blocks: Collaborative Propo...
21	1440294	13	108	Earthcube Building Blocks: Cyberconnector: Bri...
43	1541043	11	29	Earthcube Ia: Collaborative Proposal: Optimal ...
80	1927578	10	42	Earthcube Data Capabilities: Machine Learning ...
76	1740693	8	188	Earthcube Integration: Cyberway--Integrated Ca...
87	1928406	8	34	Collaborative Research: Earthcube Data Capabil...
28	1540542	8	20	Earthcube Ia: Collaborative Proposal: Optimal ...
69	1639759	7	45	Earthcube Building Blocks: Collaborative Propo...
86	1928403	7	30	Earthcube Data Capabilities: Collaborative Pro...
8	1343811	6	57	Earthcube Building Blocks: Earth System Bridge...

df_project_counts.sort_values('total_cites', ascending=False)[:10]

	nsfid	doi_count	total_cites	title_normed
46	1541049	6	556	Earthcube Ia: Collaborative Proposal: Earthcub...
42	1541039	6	556	Earthcube Ia: Collaborative Proposal: Earthcub...
34	1540998	5	546	Earthcube Ia: Collaborative Proposal: Earthcub...
48	1541390	3	400	Earthcube Rcn: Collaborative Research: Engagin...
53	1639588	6	319	Collaborative Proposal: Earthcube Building Blo...
24	1440323	16	262	Earthcube Building Blocks: Collaborative Propo...
32	1540979	3	246	Earthcube Ia: Collaborative Proposal: Building...
40	1541029	4	217	Earthcube Ia: Collaborative Proposal: Linkedea...
33	1540996	1	192	Earthcube Ia: Collaborative Proposal: Linkedea...
76	1740693	8	188	Earthcube Integration: Cyberway--Integrated Ca...

df_project_counts['title_normed'].value_counts()[:3]

Earthcube Data Infrastructure: Collaborative Proposal: A Unified Experimental-Natural Digital Data System For Analysis Of Rock Microstructures    5
Earthcube Building Blocks: Collaborative Proposal: Cloud-Hosted Real-Time Data Services For The Geosciences (Chords)                              4
Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment                                                                    3
Name: title_normed, dtype: int64

df_clean.drop_duplicates().merge(df_nsf_normed)\
    .drop('nsfid', axis=1).drop_duplicates().sort_values('title_normed')

	doi	cites	title_normed
171	10.1093/nar/gkaa637	8	Collaborative Proposal: Earthcube Building Blo...
169	10.1038/nbt.4306	247	Collaborative Proposal: Earthcube Building Blo...
172	10.1093/gigascience/giy165	23	Collaborative Proposal: Earthcube Building Blo...
173	10.1093/gigascience/giz083	17	Collaborative Proposal: Earthcube Building Blo...
174	10.3389/fmicb.2021.765268	1	Collaborative Proposal: Earthcube Building Blo...
...	...	...	...
19	10.2166/hydro.2015.331	7	Title: Earthcube Building Blocks: Integrating ...
20	10.1111/1752-1688.12436	20	Title: Earthcube Building Blocks: Integrating ...
21	10.1111/1752-1688.12474	77	Title: Earthcube Building Blocks: Integrating ...
17	10.1111/1752-1688.12387	5	Title: Earthcube Building Blocks: Integrating ...
22	10.1111/1752-1688.12437	1	Title: Earthcube Building Blocks: Integrating ...

268 rows × 3 columns

df_projects_tmp_a = \
    df_clean.drop_duplicates().merge(df_nsf_normed) \
        .drop('nsfid', axis=1).drop_duplicates().sort_values('title_normed') \
        .groupby('title_normed') \
        .agg(project_dois=('doi', 'count'),
             project_cites=('cites', 'sum'))

df_projects_tmp_b = \
    df_clean.drop_duplicates().merge(df_nsf_normed) \
        .groupby('title_normed') \
        .agg(project_ids=('nsfid', lambda d: d.unique().tolist()),
             project_counts=('nsfid', lambda d: len(d.unique().tolist())))

df_projects_summary = df_projects_tmp_b.merge(df_projects_tmp_a, left_index=True, right_index=True)
df_projects_summary['dois_per_collab'] = df_projects_summary['project_dois'] / df_projects_summary['project_counts']
df_projects_summary['cites_per_collab'] = df_projects_summary['project_cites'] / df_projects_summary['project_counts']
df_projects_summary['cites_per_doi'] = df_projects_summary['project_cites'] / df_projects_summary['project_dois']

df_projects_summary.sort_values(by='project_cites', ascending=False)

	project_ids	project_counts	project_dois	project_cites	dois_per_collab	cites_per_collab	cites_per_doi
title_normed
Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment	[1541039, 1540998, 1541049]	3	6	556	2.000000	185.333333	92.666667
Earthcube Rcn: Collaborative Research: Engaging The Greenland Ice Sheet Ocean (Griso) Science Network	[1541390]	1	3	400	3.000000	400.000000	133.333333
Collaborative Proposal: Earthcube Building Blocks: Planet Microbe: Enabling The Discovery And Integration Of Oceanographic Omics, Environmental And Physiochemical Data Layers	[1639588, 1639614]	2	6	319	3.000000	159.500000	53.166667
Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences	[1440323, 1440291, 1440332]	3	19	276	6.333333	92.000000	14.526316
Earthcube Ia: Collaborative Proposal: Building Interoperable Cyberinfrastructure (Ci) At The Interface Between Paleogeoinformatics And Bioinformatics	[1540979, 1541002]	2	9	259	4.500000	129.500000	28.777778
...	...	...	...	...	...	...	...
Earthcube Ia: Oceans Of Data: Bringing Earthcube To The Science User	[1540966]	1	1	0	1.000000	0.000000	0.000000
Collaborative Research: Earthcube Data Capabilities: Volcanology Hub For Interdisciplinary Collaboration, Tools And Resources (Victor)	[2125974]	1	1	0	1.000000	0.000000	0.000000
Earthcube Building Blocks: Collaborative Proposal: Polar Data Insights And Search Analytics For The Deep And Scientific Web	[1639675]	1	1	0	1.000000	0.000000	0.000000
Earthcube Ia: Collaborative Proposal: Enhancing Paleontological And Neontological Data Discovery Api	[1541028]	1	2	0	2.000000	0.000000	0.000000
Earthcube Rcn: An Earthcube Oceanography And Geobiology Environmental Omics Research Coordination Network (Ecogeo Rcn)	[1440066]	1	1	0	1.000000	0.000000	0.000000

61 rows × 7 columns

df_projects_summary.sort_values(by='project_cites', ascending=False)

	project_ids	project_counts	project_dois	project_cites	dois_per_collab	cites_per_collab	cites_per_doi
title_normed
Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment	[1541039, 1540998, 1541049]	3	6	556	2.000000	185.333333	92.666667
Earthcube Rcn: Collaborative Research: Engaging The Greenland Ice Sheet Ocean (Griso) Science Network	[1541390]	1	3	400	3.000000	400.000000	133.333333
Collaborative Proposal: Earthcube Building Blocks: Planet Microbe: Enabling The Discovery And Integration Of Oceanographic Omics, Environmental And Physiochemical Data Layers	[1639588, 1639614]	2	6	319	3.000000	159.500000	53.166667
Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences	[1440323, 1440291, 1440332]	3	19	276	6.333333	92.000000	14.526316
Earthcube Ia: Collaborative Proposal: Building Interoperable Cyberinfrastructure (Ci) At The Interface Between Paleogeoinformatics And Bioinformatics	[1540979, 1541002]	2	9	259	4.500000	129.500000	28.777778
...	...	...	...	...	...	...	...
Earthcube Ia: Oceans Of Data: Bringing Earthcube To The Science User	[1540966]	1	1	0	1.000000	0.000000	0.000000
Collaborative Research: Earthcube Data Capabilities: Volcanology Hub For Interdisciplinary Collaboration, Tools And Resources (Victor)	[2125974]	1	1	0	1.000000	0.000000	0.000000
Earthcube Building Blocks: Collaborative Proposal: Polar Data Insights And Search Analytics For The Deep And Scientific Web	[1639675]	1	1	0	1.000000	0.000000	0.000000
Earthcube Ia: Collaborative Proposal: Enhancing Paleontological And Neontological Data Discovery Api	[1541028]	1	2	0	2.000000	0.000000	0.000000
Earthcube Rcn: An Earthcube Oceanography And Geobiology Environmental Omics Research Coordination Network (Ecogeo Rcn)	[1440066]	1	1	0	1.000000	0.000000	0.000000

61 rows × 7 columns

df_projects_summary.to_csv("../outputs/publication_project_summary.csv")
df_projects_summary.to_json("../outputs/publication_project_summary.json")

df_projects_summary.columns

Index(['project_ids', 'project_counts', 'project_dois', 'project_cites',
       'dois_per_collab', 'cites_per_collab', 'cites_per_doi'],
      dtype='object')

df_projects_summary[['project_dois']] \
    .sort_values(by='project_dois',ascending=False)[:5]

	project_dois
title_normed
Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences	19
Earthcube Building Blocks: Cyberconnector: Bridging The Earth Observations And Earth Science Modeling For Supporting Model Validation, Verification, And Inter-Comparison	13
Earthcube Ia: Collaborative Proposal: Optimal Data Layout For Scalable Geophysical Analysis In A Data-Intensive Environment	11
Earthcube Data Capabilities: Machine Learning Enhanced Cyberinfrastructure For Understanding And Predicting The Onset Of Solar Eruptions	10
Earthcube Ia: Collaborative Proposal: Building Interoperable Cyberinfrastructure (Ci) At The Interface Between Paleogeoinformatics And Bioinformatics	9

df_projects_summary[['project_cites', 'project_counts']] \
    .sort_values(by='project_cites',ascending=False)[:5]

	project_cites	project_counts
title_normed
Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment	556	3
Earthcube Rcn: Collaborative Research: Engaging The Greenland Ice Sheet Ocean (Griso) Science Network	400	1
Collaborative Proposal: Earthcube Building Blocks: Planet Microbe: Enabling The Discovery And Integration Of Oceanographic Omics, Environmental And Physiochemical Data Layers	319	2
Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences	276	3
Earthcube Ia: Collaborative Proposal: Building Interoperable Cyberinfrastructure (Ci) At The Interface Between Paleogeoinformatics And Bioinformatics	259	2

df_projects_summary[['project_dois','cites_per_doi']] \
    .sort_values(by='cites_per_doi',ascending=False)[:5]

	project_dois	cites_per_doi
title_normed
Earthcube Rcn: Collaborative Research: Engaging The Greenland Ice Sheet Ocean (Griso) Science Network	3	133.333333
Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment	6	92.666667
Earthcube Ia: Collaborative Proposal: Linkedearth: Crowdsourcing Data Curation & Standards Development In Paleoclimatology	4	54.250000
Collaborative Proposal: Earthcube Building Blocks: Planet Microbe: Enabling The Discovery And Integration Of Oceanographic Omics, Environmental And Physiochemical Data Layers	6	53.166667
Earthcube Building Blocks: Collaborative Proposal: The Power Of Many: Ensemble Toolkit For Earth Sciences	6	37.500000

df_projects_summary[['project_counts','project_cites','cites_per_collab']] \
    .sort_values(by='cites_per_collab',ascending=False)[:5]

	project_counts	project_cites	cites_per_collab
title_normed
Earthcube Rcn: Collaborative Research: Engaging The Greenland Ice Sheet Ocean (Griso) Science Network	1	400	400.000000
Earthcube Integration: Cyberway--Integrated Capabilities Of Earthcube Building Blocks For Facilitating Cyber-Based Innovative Way Of Interdisciplinary Geoscience Studies	1	188	188.000000
Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment	3	556	185.333333
Earthcube Building Blocks: A Cognitive Computer Infrastructure For Geoscience	1	165	165.000000
Collaborative Proposal: Earthcube Building Blocks: Planet Microbe: Enabling The Discovery And Integration Of Oceanographic Omics, Environmental And Physiochemical Data Layers	2	319	159.500000

df_projects_summary[['project_counts','project_cites','cites_per_collab']] \
    .query('project_counts>1') \
    .sort_values(by='cites_per_collab',ascending=False)[:5]

	project_counts	project_cites	cites_per_collab
title_normed
Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment	3	556	185.333333
Collaborative Proposal: Earthcube Building Blocks: Planet Microbe: Enabling The Discovery And Integration Of Oceanographic Omics, Environmental And Physiochemical Data Layers	2	319	159.500000
Earthcube Ia: Collaborative Proposal: Building Interoperable Cyberinfrastructure (Ci) At The Interface Between Paleogeoinformatics And Bioinformatics	2	259	129.500000
Earthcube Ia: Collaborative Proposal: Linkedearth: Crowdsourcing Data Curation & Standards Development In Paleoclimatology	2	217	108.500000
Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences	3	276	92.000000

PUBLICATION COUNTS BY PROJECT TYPE#

change input files JSON#

df_nsf_title_normed = pd.read_csv("../outputs/nsf/nsfid_project_title_normed.csv",).set_index('Unnamed: 0')
df_nsf = pd.read_json("../outputs/nsf/data_full_dump.json").T

df_nsf_normed = df_nsf_title_normed.merge(df_nsf, left_index=True, right_index=True)
del(df_nsf_title_normed)
del(df_nsf)

from datetime import timedelta
df_nsf_normed.startDate = pd.to_datetime(df_nsf_normed.startDate)
df_nsf_normed.expDate = pd.to_datetime(df_nsf_normed.expDate)
df_nsf_normed.fundsObligatedAmt = pd.to_numeric(df_nsf_normed.fundsObligatedAmt)
df_nsf_normed['proj_duration'] = round((df_nsf_normed.expDate - df_nsf_normed.startDate) / timedelta(days=365)).astype(int)

df_nsf_normed.columns

Index(['title_normed', 'abstractText', 'estimatedTotalAmt',
       'fundsObligatedAmt', 'fundProgramName', 'id', 'projectOutComesReport',
       'publicationResearch', 'startDate', 'expDate', 'title', 'awardee',
       'proj_duration'],
      dtype='object')

df_project_summary_full = \
    df_project_counts \
    .set_index('nsfid') \
    .merge(df_nsf_normed, left_index=True, right_index=True)
df_project_summary_full = df_project_summary_full.drop('title_normed_x', axis=1)
df_project_summary_full = df_project_summary_full.rename(columns={'title_normed_y': 'title_normed'})

df_tmp = df_project_summary_full.query('expDate<=2020')[['proj_duration','doi_count','total_cites']]

(df_tmp.doi_count / df_tmp.proj_duration).describe()

count    52.000000
mean      1.079167
std       0.925621
min       0.250000
25%       0.500000
50%       0.708333
75%       1.375000
max       4.333333
dtype: float64

CORRELATION ANALYSIS#

df_project_summary_full.query('expDate<=2020')[['proj_duration','doi_count','total_cites']].corr()

	proj_duration	doi_count	total_cites
proj_duration	1.000000	0.255790	-0.030888
doi_count	0.255790	1.000000	0.370514
total_cites	-0.030888	0.370514	1.000000

NO meaningful correlation between number of papers and project duration

df_project_summary_full.query('expDate<=2020')[['fundsObligatedAmt', 'total_cites']].corr()

	fundsObligatedAmt	total_cites
fundsObligatedAmt	1.000000	-0.028323
total_cites	-0.028323	1.000000

NO correlation between award amount and cites

df_project_summary_full.query('expDate<=2020')[['fundsObligatedAmt', 'doi_count']].corr()

	fundsObligatedAmt	doi_count
fundsObligatedAmt	1.000000	0.249793
doi_count	0.249793	1.000000

NO correlation between number of papers and amount of funding

df_project_summary_full.columns

Index(['doi_count', 'total_cites', 'title_normed', 'abstractText',
       'estimatedTotalAmt', 'fundsObligatedAmt', 'fundProgramName', 'id',
       'projectOutComesReport', 'publicationResearch', 'startDate', 'expDate',
       'title', 'awardee', 'proj_duration'],
      dtype='object')

#     [[('proj_duration','max'), 'project_counts', 'project_cites', 'project_dois']]\

df_project_summary_full[['doi_count', 'total_cites', 'title_normed', 'proj_duration', 'startDate', 'expDate']]\
    .groupby('title_normed')\
    .agg({'doi_count': ['sum'], 'total_cites': ['sum'], 'proj_duration': ['max'], 'startDate': ['min'], 'expDate': ['max']}).sort_values(by=('doi_count', 'sum'))\
    .merge(df_projects_summary, right_index=True, left_on='title_normed')\
    .sort_values('project_dois').sort_values(('proj_duration', 'max'), ascending=False)\
    .rename(columns={
                ('proj_duration', 'max'): 'award_duration', 
                'project_counts': 'collaborators',
                ('doi_count', 'sum'): 'award_publications',
                ('total_cites', 'sum'): 'award_cites',
                'project_counts': 'collaborators',
                'project_id': 'collaborator_nsfid',
                ('startDate', 'min'): 'project_start',
                ('expDate', 'max'): 'project_end',
        })\
    [['award_publications','award_cites','award_duration','project_start','project_end','project_ids','collaborators']]\
    .reset_index().rename(columns={'title_normed': 'project_title'})\
    .to_csv("../outputs/projects_detailed_outcomes.csv", index=False)

df_tmp = \
    df_project_summary_full[['doi_count', 'total_cites', 'title_normed', 'proj_duration', 'startDate', 'expDate']]\
        .groupby('title_normed')\
        .agg({'doi_count': ['sum'], 'total_cites': ['sum'], 'proj_duration': ['max'], 'startDate': ['min'], 'expDate': ['max']}).sort_values(by=('doi_count', 'sum'))\
        .merge(df_projects_summary, right_index=True, left_on='title_normed')\
        .sort_values('project_dois').sort_values(('proj_duration', 'max'), ascending=False)\
        .rename(columns={
                    ('proj_duration', 'max'): 'award_duration', 
                    'project_counts': 'collaborators',
                    'project_counts': 'collaborators',
                    'project_id': 'collaborator_nsfid',
                    ('startDate', 'min'): 'project_start',
                    ('expDate', 'max'): 'project_end',
            })\
#        .reset_index().rename(columns={'title_normed': 'project_title'})

df_tmp = df_tmp.loc[:, ~df_tmp.columns.isin([('doi_count', 'sum'), ('total_cites', 'sum')])]

df_tmp  = df_tmp.iloc[:,:7]
df_tmp.columns = ['Award Duration', 'Award Start', 'Award End', 'Award IDs', 'Collaborators', 'Publication Count', 'Citation Total']
df_tmp[:3]

	Award Duration	Award Start	Award End	Award IDs	Collaborators	Publication Count	Citation Total
title_normed
Earthcube Rcn Is-Geo: Intelligent Systems Research To Support Geosciences	6	2016-08-15	2023-01-31	[1632211]	1	4	78
Collaborative Proposal: Earthcube Integration: Iceberg: Imagery Cyberinfrastructure And Extensible Building-Blocks To Enhance Research In The Geosciences	5	2017-10-01	2022-09-30	[1740595, 1740581]	2	4	14
Earthcube Building Blocks: Collaborative Proposal: The Power Of Many: Ensemble Toolkit For Earth Sciences	5	2016-09-01	2021-08-31	[1639698, 1639707, 1639694]	3	6	225

print("""
| Award Title | Start Date | End Date | Publication Count |
|---:|:--:|:--:|:--:|""")

for r in df_tmp.sort_values('Publication Count', ascending=False)[:5].itertuples():
    print(
        "|",
        r[0], 
        "(" +
        ", ".join([f"[NSF #{d}](https://nsf.gov/awardsearch/showAward?AWD_ID={d}&HistoricalAwards=false)" for d in r[4]])        
        + ") |",
        r[2].date(), "|" ,
        r[3].date(), "|" ,
        r[6], "|"
    )

| Award Title | Start Date | End Date | Publication Count |
|---:|:--:|:--:|:--:|
| Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences ([NSF #1440323](https://nsf.gov/awardsearch/showAward?AWD_ID=1440323&HistoricalAwards=false), [NSF #1440291](https://nsf.gov/awardsearch/showAward?AWD_ID=1440291&HistoricalAwards=false), [NSF #1440332](https://nsf.gov/awardsearch/showAward?AWD_ID=1440332&HistoricalAwards=false)) | 2014-09-01 | 2018-08-31 | 19 |
| Earthcube Building Blocks: Cyberconnector: Bridging The Earth Observations And Earth Science Modeling For Supporting Model Validation, Verification, And Inter-Comparison ([NSF #1440294](https://nsf.gov/awardsearch/showAward?AWD_ID=1440294&HistoricalAwards=false)) | 2014-09-01 | 2017-08-31 | 13 |
| Earthcube Ia: Collaborative Proposal: Optimal Data Layout For Scalable Geophysical Analysis In A Data-Intensive Environment ([NSF #1541043](https://nsf.gov/awardsearch/showAward?AWD_ID=1541043&HistoricalAwards=false), [NSF #1540542](https://nsf.gov/awardsearch/showAward?AWD_ID=1540542&HistoricalAwards=false)) | 2015-09-01 | 2019-08-31 | 11 |
| Earthcube Data Capabilities: Machine Learning Enhanced Cyberinfrastructure For Understanding And Predicting The Onset Of Solar Eruptions ([NSF #1927578](https://nsf.gov/awardsearch/showAward?AWD_ID=1927578&HistoricalAwards=false)) | 2019-09-01 | 2023-08-31 | 10 |
| Earthcube Data Infrastructure: Collaborative Proposal: A Unified Experimental-Natural Digital Data System For Analysis Of Rock Microstructures ([NSF #1639749](https://nsf.gov/awardsearch/showAward?AWD_ID=1639749&HistoricalAwards=false), [NSF #1639710](https://nsf.gov/awardsearch/showAward?AWD_ID=1639710&HistoricalAwards=false), [NSF #1639716](https://nsf.gov/awardsearch/showAward?AWD_ID=1639716&HistoricalAwards=false), [NSF #1639738](https://nsf.gov/awardsearch/showAward?AWD_ID=1639738&HistoricalAwards=false), [NSF #1639748](https://nsf.gov/awardsearch/showAward?AWD_ID=1639748&HistoricalAwards=false)) | 2017-09-01 | 2021-08-31 | 9 |

Award Title	Start Date	End Date	Publication Count
Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences (NSF #1440323, NSF #1440291, NSF #1440332)	2014-09-01	2018-08-31	19
Earthcube Building Blocks: Cyberconnector: Bridging The Earth Observations And Earth Science Modeling For Supporting Model Validation, Verification, And Inter-Comparison (NSF #1440294)	2014-09-01	2017-08-31	13
Earthcube Ia: Collaborative Proposal: Optimal Data Layout For Scalable Geophysical Analysis In A Data-Intensive Environment (NSF #1541043, NSF #1540542)	2015-09-01	2019-08-31	11
Earthcube Data Capabilities: Machine Learning Enhanced Cyberinfrastructure For Understanding And Predicting The Onset Of Solar Eruptions (NSF #1927578)	2019-09-01	2023-08-31	10
Earthcube Data Infrastructure: Collaborative Proposal: A Unified Experimental-Natural Digital Data System For Analysis Of Rock Microstructures (NSF #1639749, NSF #1639710, NSF #1639716, NSF #1639738, NSF #1639748)	2017-09-01	2021-08-31	9

ax = df_tmp['Publication Count'].value_counts().sort_index()\
    .T.plot.bar(ylabel='Number of Projects', xlabel='Publication Count', title='Publication Density to Project Counts')

for container in ax.containers:
    ax.bar_label(container,  padding=2, fontsize=6)
    
ax.get_figure().savefig("../outputs/fig_pub_project_density.png", dpi=300)

../../_images/01_basic_project_analysis_44_0.png

print("""
| Award Title | Start Date | End Date | Total Citations |
|---:|:--:|:--:|:--:|""")

for r in df_tmp.sort_values('Citation Total', ascending=False)[:5].itertuples():
    print(
        "|",
        r[0], 
        "(" +
        ", ".join([f"[NSF #{d}](https://nsf.gov/awardsearch/showAward?AWD_ID={d}&HistoricalAwards=false)" for d in r[4]])        
        + ") |",
        r[2].date(), "|" ,
        r[3].date(), "|" ,
        r[7], "|"
    )

| Award Title | Start Date | End Date | Total Citations |
|---:|:--:|:--:|:--:|
| Earthcube Ia: Collaborative Proposal: Earthcube Integration & Test Environment ([NSF #1541039](https://nsf.gov/awardsearch/showAward?AWD_ID=1541039&HistoricalAwards=false), [NSF #1540998](https://nsf.gov/awardsearch/showAward?AWD_ID=1540998&HistoricalAwards=false), [NSF #1541049](https://nsf.gov/awardsearch/showAward?AWD_ID=1541049&HistoricalAwards=false)) | 2015-09-01 | 2018-08-31 | 556 |
| Earthcube Rcn: Collaborative Research: Engaging The Greenland Ice Sheet Ocean (Griso) Science Network ([NSF #1541390](https://nsf.gov/awardsearch/showAward?AWD_ID=1541390&HistoricalAwards=false)) | 2016-08-01 | 2018-07-31 | 400 |
| Collaborative Proposal: Earthcube Building Blocks: Planet Microbe: Enabling The Discovery And Integration Of Oceanographic Omics, Environmental And Physiochemical Data Layers ([NSF #1639588](https://nsf.gov/awardsearch/showAward?AWD_ID=1639588&HistoricalAwards=false), [NSF #1639614](https://nsf.gov/awardsearch/showAward?AWD_ID=1639614&HistoricalAwards=false)) | 2017-09-01 | 2020-08-31 | 319 |
| Earthcube Building Blocks: Collaborative Proposal: Geosoft: Collaborative Open Source Software Sharing For Geosciences ([NSF #1440323](https://nsf.gov/awardsearch/showAward?AWD_ID=1440323&HistoricalAwards=false), [NSF #1440291](https://nsf.gov/awardsearch/showAward?AWD_ID=1440291&HistoricalAwards=false), [NSF #1440332](https://nsf.gov/awardsearch/showAward?AWD_ID=1440332&HistoricalAwards=false)) | 2014-09-01 | 2018-08-31 | 276 |
| Earthcube Ia: Collaborative Proposal: Building Interoperable Cyberinfrastructure (Ci) At The Interface Between Paleogeoinformatics And Bioinformatics ([NSF #1540979](https://nsf.gov/awardsearch/showAward?AWD_ID=1540979&HistoricalAwards=false), [NSF #1541002](https://nsf.gov/awardsearch/showAward?AWD_ID=1541002&HistoricalAwards=false)) | 2015-09-01 | 2019-08-31 | 259 |

df_eco = pd.read_csv("../outputs/eco_funded_project_list.tsv", sep='\t')

df_project_counts

	nsfid	doi_count	total_cites	title_normed
0	1324760	5	31	Rcn: Building A Sediment Experimentalist Netwo...
1	1340233	2	17	Earthcube Test Enterprise Governance: An Agile...
2	1340265	2	6	Ec3 - Earth-Centered Communication For Cyberin...
3	1340301	3	9	Earthcube Rcn: C4P: Collaboration And Cyberinf...
4	1343760	5	165	Earthcube Building Blocks: A Cognitive Compute...
...	...	...	...	...
91	2026951	1	4	Earthcube Capabilities: Cloud-Based Accessible...
92	2125974	1	0	Collaborative Research: Earthcube Data Capabil...
93	2126315	3	17	Earthcube Capabilities: Openmindat - Open Acce...
94	2126449	1	0	Collaborative Research: Earthcube Capabilities...
95	2126474	5	4	Collaborative Research: Earthcube Capabilities...

96 rows × 4 columns

df_eco.merge(df_project_counts)

	nsfid	shortname	program	doi_count	total_cites	title_normed
0	1324760	RCN	Building a Sediment Experimentalist Network (SEN)	5	31	Rcn: Building A Sediment Experimentalist Netwo...
1	1340233	Office	EarthCube Test Enterprise Governance: An Agile...	2	17	Earthcube Test Enterprise Governance: An Agile...
2	1340265	BB	EC3: Earth-Centered Communication for Cyberinf...	2	6	Ec3 - Earth-Centered Communication For Cyberin...
3	1340301	RCN	C4P: Collaboration and Cyberinfrastructure for...	3	9	Earthcube Rcn: C4P: Collaboration And Cyberinf...
4	1343760	BB	A Cognitive Computer Infrastructure for Geosci...	5	165	Earthcube Building Blocks: A Cognitive Compute...
...	...	...	...	...	...	...
91	2026951	DC	Cloud-Based Accessible and Reproducible Modeli...	1	4	Earthcube Capabilities: Cloud-Based Accessible...
92	2126315	DC	OpenMindat - Open Access and Interoperable Min...	3	17	Earthcube Capabilities: Openmindat - Open Acce...
93	2125974	DC	Volcanology hub for Interdisciplinary Collabor...	1	0	Collaborative Research: Earthcube Data Capabil...
94	2126474	DC	ICESpark: An Open-Source Big Data Platform for...	5	4	Collaborative Research: Earthcube Capabilities...
95	2126449	DC	ICESpark: An Open-Source Big Data Platform for...	1	0	Collaborative Research: Earthcube Capabilities...

96 rows × 6 columns

df_eco.merge(df_project_counts).\
    groupby('shortname').agg({'total_cites': 'sum'})

## TODO/NOTE: these are wrong as they include dupes

	total_cites
shortname
BB	2126
DC	226
DI	161
IA	2941
Office	17
RCN	670

PROJECT ANALYSIS#

df_tmp = pd.read_json("../outputs/nsf/data_full_dump.json").T[['startDate','expDate']] 

df_tmp.startDate, df_tmp.expDate = pd.to_datetime(df_tmp.startDate), pd.to_datetime(df_tmp.expDate)
df_tmp.loc[:,'duration'] = \
    df_tmp.expDate.dt.year - df_tmp.startDate.dt.year

ax = \
    df_tmp.duration.value_counts().sort_index()\
        .plot(kind='bar', title='EC Funded Project Durations', ylabel='count (frequency)', xlabel='duration (years)')

for container in ax.containers:
    ax.bar_label(container,  padding=2, fontsize=6)
    
ax.get_figure().savefig("../outputs/fig_ec_project_duration.png", dpi=300)

ax = pd.to_datetime(pd.read_json("../outputs/nsf/data_full_dump.json").T.startDate) \
    .dt.year.value_counts() \
    .sort_index() \
    .plot(kind='bar', rot=45, title='NSF Funded EC Projects by Year', ylabel='count of funded projects')\
#    .get_figure().savefig("../outputs/fig_funded_projects_by_year.png", dpi=300)


for container in ax.containers:
    ax.bar_label(container,  padding=2, fontsize=6)
    
ax.get_figure().savefig("../outputs/fig_funded_projects_by_year.png", dpi=300)

../../_images/01_basic_project_analysis_52_0.png

df_pubs = pd.read_csv("../outputs/full_nsf_doi_project_summary.tsv",sep='\t').query('cites>-1')\
    .drop_duplicates(subset='doi')
ax = df_pubs.query('year != "None"').year.value_counts().sort_index()\
    .plot.bar(rot=45, title='EC Reported Publication Counts by Year', edgecolor='white', linewidth=2)

for container in ax.containers:
    ax.bar_label(container,  padding=2, fontsize=6)
    
ax.get_figure().savefig("../outputs/fig_publications_by_year.png", dpi=300)

../../_images/01_basic_project_analysis_53_0.png

df_pubs

	nsfid	doi	title	ams_bib	cites	year
0	1324760	10.1016/j.geomorph.2015.03.039	Data management, sharing, and reuse in experim...	Hsu, L., R. L. Martin, B. McElroy, K. Litwin-M...	19	2015
2	1324760	10.2110/sedred.2013.4.9	Building a Sediment Experimentalist Network (S...	Hsu, L., B. McElroy, R. L. Martin, and W. Kim,...	2	2013
3	1324760	10.1029/2021ef002088	Earthcasting: Geomorphic Forecasts for Society	Ferdowsi, B., J. D. Gartner, K. N. Johnson, A....	0	2021
4	1324760	10.1029/2017jf004576	Laboratory Investigation on Effects of Flood I...	Miller, K. L., W. Kim, and B. McElroy, 2019: L...	10	2019
5	1340233	10.1126/science.342.6162.1041-b	Open Data: Crediting a Culture of Cooperation	Bolukbasi, B., and Coauthors, 2013: Open Data:...	10	2013
...	...	...	...	...	...	...
354	1639694	10.1016/j.dib.2022.107824	A new hourly dataset for photovoltaic energy p...	Hu, W., G. Cervone, A. Merzky, M. Turilli, and...	2	2022
355	1639694	10.1016/j.cageo.2019.07.003	Dynamically Optimized Unstructured Grid (DOUG)...	Hu, W., and G. Cervone, 2019: Dynamically Opti...	2	2019
356	1440221	10.1007/978-3-319-33245-1	Ontology Engineering	Tamma, V., M. Dragoni, R. Gonçalves, and A. Ła...	2	2016
357	1541028	10.3897/biss.2.26644	The ePANDDA project: linking the Paleobiology ...	Sessa, J., S. Butts, T. Karim, G. Nelson, C. N...	0	2018
358	1541028	10.1130/abs/2017am-298208	THE EPANDDA PROJECT: LINKING THE PALEOBIOLOGY ...	Sessa, J. A., and Coauthors, 2017: THE EPANDDA...	0	2017

244 rows × 6 columns

FUNDED PROJECTS#

df = pd.read_csv("../outputs/citations.tsv", \
                 sep="\t", header=None)

df.columns = ['nsfid', 'doi', 'cites']
df.doi = df.doi.str.lower()

df_clean = df.drop_duplicates(subset='doi')
df_clean = df_clean.query('cites>-1')

df_clean

	nsfid	doi	cites
0	1324760	10.1016/j.geomorph.2015.03.039	19
2	1324760	10.2110/sedred.2013.4.9	2
3	1324760	10.2110/sedred.2013.4	0
4	1324760	10.1029/2021ef002088	0
5	1324760	10.1029/2017jf004576	10
...	...	...	...
356	1639694	10.1016/j.dib.2022.107824	2
357	1639694	10.1016/j.cageo.2019.07.003	2
358	1440221	10.1007/978-3-319-33245-1	2
359	1541028	10.3897/biss.2.26644	0
360	1541028	10.1130/abs/2017am-298208	0

245 rows × 3 columns

df = pd.read_csv("../outputs/full_nsf_doi_project_summary.tsv",sep='\t').query('cites>-1')\
    .drop_duplicates(subset='doi')

set(df_clean.doi).difference(set(df.doi))

{'10.2110/sedred.2013.4'}

# graph
df_pubs = pd.read_csv("../outputs/full_nsf_doi_project_summary.tsv",sep='\t').query('cites>-1')\
    .drop_duplicates(subset='doi')

df_funded = pd.to_datetime(
   pd.read_json("../outputs/nsf/data_full_dump.json").T.startDate
).dt.year

df_funded.value_counts().sort_index()

2013    18
2014    29
2015    41
2016    35
2017    34
2018     1
2019    26
2020    15
2021    16
Name: startDate, dtype: int64

df_pub_counts = pd.DataFrame(df_pubs.query('year != "None"').year.value_counts())
df_pub_counts.index = df_pub_counts.index.astype(int)

df_tmp = pd.concat([df_funded.value_counts(), df_pub_counts], axis=1) 
df_tmp.columns = ['Funded Projects', 'CumPublication Count']
# df_tmp.loc[:,'Cummulative Publication Count'] = df_tmp.iloc[:,1].sort_index().cumsum()

df_tmp.sort_index().cumsum()

	Funded Projects	CumPublication Count
2013	18.0	2
2014	47.0	9
2015	88.0	23
2016	123.0	64
2017	157.0	97
2018	158.0	133
2019	184.0	161
2020	199.0	195
2021	215.0	224
2022	NaN	241
2023	NaN	243

#ax = df_tmp.sort_index().plot(kind='bar')
ax = df_tmp.sort_index().sort_index(ascending=False)\
 .plot.barh(title='EC Funded Projects and Reported Publications by year', edgecolor='white', linewidth=2)


for container in ax.containers:
    ax.bar_label(container,  padding=2, fontsize=6)
    
for container in ax.containers:
    ax.bar_label(container,  padding=2, fontsize=6)

ax.get_xaxis().set_visible(False)
ax.get_figure().savefig("../outputs/figh_v4_projects_publications_by_year.png", dpi=300)
### NOTE: THERE IS A DISCREPANCY OF 2 in cummulative sum

../../_images/01_basic_project_analysis_65_0.png

ax = df_tmp.sort_index().cumsum().fillna(df_tmp['Funded Projects'].sum())\
    .plot(title='EC Funded Projects and Reported Publications\n(cummulative by year)', rot=45, kind='bar')

for container in ax.containers:
    ax.bar_label(container,  padding=1.5, fontsize=5)
    
ax.get_figure().savefig("../outputs/fig_cumm_projects_publications_by_year.png", dpi=300)
### NOTE: THERE IS A DISCREPANCY OF 2 in cummulative sum

../../_images/01_basic_project_analysis_66_0.png

EarthCube Program Analysis

BASIC PROJECT ANALYSIS

Contents

BASIC PROJECT ANALYSIS#

PUBLICATION COUNTS BY PROJECT#

PUBLICATION COUNTS BY PROJECT TYPE#

change input files JSON#

CORRELATION ANALYSIS#

PROJECT ANALYSIS#

FUNDED PROJECTS#