12. Counting new vs recurring authors, for a specific journal
Level: Advanced
This query requires a good understanding of SQL and the Dimensions data model
Description
This query evaluates a single journal (specified by journal ID - e.g. jour.1115214) and counts the number of authors per year that it has published. Authors are split into two categories: new authors who have never appeared in the journal before, and "recurring" authors who have already written at least one previous article in the journal.
Note
Because the query starts in 2011, some "recurring" authors are never counted as new authors: If an author publishes one paper each in, say, 2007 and 2014, they will appear as a recurring author in 2014, but the year in which they would be a "new" author is not displayed.
Query
WITH authoryear AS (
-- how many papers has each individual researcher published in
-- the specified journal? Broken down by year.
SELECT
pubs.year,
author.researcher_id,
COUNT(pubs.id) AS numpubs
FROM
`dimensions-ai.data_analytics.publications` AS pubs
CROSS JOIN UNNEST(pubs.authors) AS author
WHERE
author.researcher_id IS NOT NULL
AND journal.id="jour.1115214" -- Nature BioTechnology
GROUP BY
author.researcher_id, pubs.year
),
authorfirst AS (
-- For each author, what year is their FIRST publication in
-- the specified journal?
SELECT researcher_id, MIN(year) AS minyear
FROM authoryear
GROUP BY researcher_id
),
authorsummary AS (
-- Modify the author-level list of publications per
-- year by adding a new field, "firstyear", that indicates
-- whether this is the year in which they are "new."
SELECT
ay.*,
IF(ay.year=af.minyear, TRUE, FALSE) AS firstyear
FROM authoryear ay
INNER JOIN authorfirst af
ON af.researcher_id=ay.researcher_id
),
numauthors AS (
-- For each year, total up the new and recurring authors
SELECT year, firstyear,
COUNT(DISTINCT researcher_id) AS numresearchers
FROM authorsummary
WHERE year>2010
GROUP BY year, firstyear
)
-- Finally, we rearrange the "numauthors" subquery so
-- each year in the specified range only has a SINGLE ROW,
-- indicating both the new and recurring authors.
SELECT
year,
SUM(
CASE
WHEN firstyear
THEN numresearchers
ELSE 0
END
) AS num_first,
SUM(
CASE
WHEN NOT firstyear
THEN numresearchers
ELSE 0
END
) AS num_recurring
FROM numauthors
GROUP BY year
ORDER BY year
Results
[
{
"year": "2011",
"num_first": "1041",
"num_recurring": "352"
},
{
"year": "2012",
"num_first": "859",
"num_recurring": "374"
},
{
"year": "2013",
"num_first": "927",
"num_recurring": "347"
},
{
"year": "2014",
"num_first": "1088",
"num_recurring": "338"
},
{
"year": "2015",
"num_first": "1044",
"num_recurring": "392"
},
{
"year": "2016",
"num_first": "1319",
"num_recurring": "350"
},
{
"year": "2017",
"num_first": "1074",
"num_recurring": "404"
},
{
"year": "2018",
"num_first": "1111",
"num_recurring": "419"
},
{
"year": "2019",
"num_first": "1219",
"num_recurring": "447"
},
{
"year": "2020",
"num_first": "1611",
"num_recurring": "570"
},
{
"year": "2021",
"num_first": "411",
"num_recurring": "189"
}
]