### Load packages  
# Standard
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)

# Network specific
library(tidygraph) # For tidy-style graph manipulation
library(ggraph) # For ggplot2 style graph plotting

# EconGeo Specific
library(EconGeo) # Economc Geography functions

A closer look at the Chinese patenting activity

Loading and understanding the data structure

So, let’s start the fun. I for you extracted Chinese patents from our EPO PATSTAT databases filed at either the EPO or the USTPO. I further provide you adittional data. Lets take a look:

patents <- readRDS("data/CN_patent.rds")
inventors <- readRDS("data/CN_inventor.rds") %>% filter(str_detect(city, "china")) %>% mutate(city = city %>% str_remove("\\,.*")) 
el_pat_inv <- readRDS("data/CN_el_inventor_patent.rds") 
el_pat_tech <- readRDS("data/CN_el_patent_field.rds")
field_names <- read_csv("data/ipc_technology.csv") %>%
  select(field_nr, sector, field_name) %>%
  distinct(field_nr, .keep_all = TRUE) %>%
  mutate(field_nr = field_nr) %>%
  arrange(field_nr)
cities_geo <- inventors %>%
  distinct(city, .keep_all = TRUE) %>%
  select(city, lon, lat)
reg_pat <- el_pat_inv %>%
  arrange(appln_id, invt_seq_nr) %>%
  distinct(appln_id, .keep_all = TRUE) %>%
  left_join(inventors %>% select(person_id, city), by = "person_id") %>%
  left_join(patents %>% select(appln_id, appln_filing_year, nb_citing_docdb_fam), by = "appln_id") %>%
  select(-invt_seq_nr, person_id) 

Economic complexity of Chinese patenting activity

Let’s not take a look at the complexity of Chinese patenting activity.

We will here use some neath functions from theR package EconGeo. Its only on github, so we have to install it from there.

#devtools::install_github("PABalland/EconGeo", force = T)
library(EconGeo)

Data munging

First, here we will need to create some big matrices, therefore I will here already define a little helper function which takes an edgelist as input and produces really efficient a sparse M[i,j] 2-mode matrix.

## Helper function
create_sparse_matrix <- function(i.input, j.input){
  require(Matrix)
  mat <- spMatrix(nrow = i.input %>% n_distinct(),
                  ncol = j.input %>% n_distinct(),
                  i = i.input %>% factor() %>% as.numeric(),
                  j = j.input %>% factor() %>% as.numeric(),
                  x = rep(1, i.input %>% length() ) )
  
  row.names(mat) <- i.input %>% factor() %>% levels()
  colnames(mat) <- j.input %>% factor() %>% levels()
  return(mat)
}

Now, we will create this sparse matrix between patents and their associated technology classes.

mat_tech <- create_sparse_matrix(i = el_pat_tech %>% pull(appln_id),
                                 j = el_pat_tech %>% pull(techn_field_nr))

Reminding basic matrix algebra, a 2-mode matrix M[i,j] can be transformed to a 1-mode adjacency matrix M[j,j] by taking the dot-product (matrix multiplication) of its transposed version t(M[i,j]), and into a 1-mode adjacency matrix M[i,i] by taking the dot-product with itself. In summary: M[i,j] %*% M[i,j] = M[j,j] and M[i,j] %*% t(M[i,j]) = M[i,i].

We can do that very efficiently with the crossprod() and tcrossprod() function of the Matrix package.

mat_tech %<>% 
  crossprod() %>% 
  as.matrix() 
#mat.pat <- mat.tech %>% tcrossprod() %>% as.matrix() # Would create a patent matrix based on shared IPC classes (Not a good idea here)
mat_tech
##       1    2    3     4    5     6   7    8    9   10  11   12   13   14   15
## 1  9416  909  213    63  209   476   8  411  646  390  15  487   79   43    4
## 2   909 8670  629   572  193  2957  45  555  936  224   4  137   39    4    1
## 3   213  629 7477  4345  289  1083  80   27  174  274   3  134   20    3    0
## 4    63  572 4345 14724  303  2785 271   10   23  354   0  153   12    3    1
## 5   209  193  289   303 1817   418   3   50   15   96   0   90    5    3    0
## 6   476 2957 1083  2785  418 15567 497  255  430  597  20  479  147   19   22
## 7     8   45   80   271    3   497 747    0    2   30   2   85    7    1    0
## 8   411  555   27    10   50   255   0 4826  732  151   5   22    9   90    4
## 9   646  936  174    23   15   430   2  732 4246  195   3   24   48   22    3
## 10  390  224  274   354   96   597  30  151  195 3722 202  227  198   41   82
## 11   15    4    3     0    0    20   2    5    3  202 457    9   22   74  201
## 12  487  137  134   153   90   479  85   22   24  227   9 1824   44    4    2
## 13   79   39   20    12    5   147   7    9   48  198  22   44 1488   24   50
## 14   43    4    3     3    3    19   1   90   22   41  74    4   24 2852  346
## 15    4    1    0     1    0    22   0    4    3   82 201    2   50  346 1420
## 16    4    1    0     0    0     6   0    2    0   15  96    1   77 1283  623
## 17  101   26    0     0    0     6   0   72   62   15   9    2   28  180   31
## 18    5    0    0     0    0     3   0    0    1    3   3    2    7   64  119
## 19  116   28    0     0    1     9   1  143  114   27  22    2   28  390  104
## 20  183   26   10     0    0    12   0   71   31   15   5    4   17   59    6
## 21  220  129   29     1    7    59   1  221  162   40   7    6   29   30    9
## 22  133   39    6     1    1    14   0  156   52   58  18    0   22   17   26
## 23  136   33    4     2    0    12   0   46   34  138  65   20   64  284   65
## 24   23    9    1     0    3    11   0   20    3   56   6    6   49   19   19
## 25   55   65   20     1    1    55   2   36   31   60   4   76   36    3    3
## 26   91  113   15     0    1    33   1   59   64   59   1   19   25    5    3
## 27  145   72    4     1    1    51   5   21   21   77   4   30   58    3    1
## 28   28   23   18     2    4    41   0   19   25   14   3    2   13   16   15
## 29  120   67   22     2    1    27   4   33   76   19   5    9   49   25   31
## 30   96  320    7     1    2    59   0  161   16   28   3   33   22    6    1
## 31  114  222   40     3    3   125   1   22   46   45   5   55   43    2    1
## 32  172   33   15    15    2    63   3    9   16   70   4   84   15    0    0
## 33   94  267   35    26    3   166   4   13   23   32   0   39   54    1    0
## 34  112  122   33    10   16    97   1   34   32   39   0   34  107    5    1
## 35   68  139   56     6    2   122   0    3    6   56   3   46   11    2    0
##      16   17  18   19   20   21  22   23  24   25   26  27  28   29   30   31
## 1     4  101   5  116  183  220 133  136  23   55   91 145  28  120   96  114
## 2     1   26   0   28   26  129  39   33   9   65  113  72  23   67  320  222
## 3     0    0   0    0   10   29   6    4   1   20   15   4  18   22    7   40
## 4     0    0   0    0    0    1   1    2   0    1    0   1   2    2    1    3
## 5     0    0   0    1    0    7   1    0   3    1    1   1   4    1    2    3
## 6     6    6   3    9   12   59  14   12  11   55   33  51  41   27   59  125
## 7     0    0   0    1    0    1   0    0   0    2    1   5   0    4    0    1
## 8     2   72   0  143   71  221 156   46  20   36   59  21  19   33  161   22
## 9     0   62   1  114   31  162  52   34   3   31   64  21  25   76   16   46
## 10   15   15   3   27   15   40  58  138  56   60   59  77  14   19   28   45
## 11   96    9   3   22    5    7  18   65   6    4    1   4   3    5    3    5
## 12    1    2   2    2    4    6   0   20   6   76   19  30   2    9   33   55
## 13   77   28   7   28   17   29  22   64  49   36   25  58  13   49   22   43
## 14 1283  180  64  390   59   30  17  284  19    3    5   3  16   25    6    2
## 15  623   31 119  104    6    9  26   65  19    3    3   1  15   31    1    1
## 16 2290   66  73  202   15   14  18   19   5    2    1   1   3   29    1    2
## 17   66 1316   6  367   60  126  33  123  12    6    4   3  55  243    1    7
## 18   73    6 310   29    3    3   0   20   3   10    2   0   4   40    3    1
## 19  202  367  29 1661  161  158  46  220  48   10   23  13  45   80   27   10
## 20   15   60   3  161 1135  232 111  214  76    9   59  15  43   67   38   15
## 21   14  126   3  158  232 1541 107  112  15   52  106  18  85  222   19   24
## 22   18   33   0   46  111  107 554   43   5    2    8   2  34   34    3    2
## 23   19  123  20  220  214  112  43 1711 251   53   28  45  60   92   53   67
## 24    5   12   3   48   76   15   5  251 653   14   13  31   2   20   32   14
## 25    2    6  10   10    9   52   2   53  14 1150   86   7  32   57    7  113
## 26    1    4   2   23   59  106   8   28  13   86 1368  22  17   63   57   96
## 27    1    3   0   13   15   18   2   45  31    7   22 911   0   14   77   89
## 28    3   55   4   45   43   85  34   60   2   32   17   0 565   59    2    8
## 29   29  243  40   80   67  222  34   92  20   57   63  14  59 1102   14   24
## 30    1    1   3   27   38   19   3   53  32    7   57  77   2   14 1003   46
## 31    2    7   1   10   15   24   2   67  14  113   96  89   8   24   46 1512
## 32    2   17   0    6    6   17   1   14  13   66   28  51   4   31   18  131
## 33    2    9  18    6    4   14   2   56   9   96   43  17   8   44   36  196
## 34    2   15   0   12   17   69   9   41   5   58   28  14  67   41   68   43
## 35    0   14   1   24   15   33   3   59  27   61   37  33   3   44   20  183
##      32   33   34   35
## 1   172   94  112   68
## 2    33  267  122  139
## 3    15   35   33   56
## 4    15   26   10    6
## 5     2    3   16    2
## 6    63  166   97  122
## 7     3    4    1    0
## 8     9   13   34    3
## 9    16   23   32    6
## 10   70   32   39   56
## 11    4    0    0    3
## 12   84   39   34   46
## 13   15   54  107   11
## 14    0    1    5    2
## 15    0    0    1    0
## 16    2    2    2    0
## 17   17    9   15   14
## 18    0   18    0    1
## 19    6    6   12   24
## 20    6    4   17   15
## 21   17   14   69   33
## 22    1    2    9    3
## 23   14   56   41   59
## 24   13    9    5   27
## 25   66   96   58   61
## 26   28   43   28   37
## 27   51   17   14   33
## 28    4    8   67    3
## 29   31   44   41   44
## 30   18   36   68   20
## 31  131  196   43  183
## 32 1122   67   23   54
## 33   67 1561   86  130
## 34   23   86 1048   35
## 35   54  130   35 1330

This matrix can be used for the relatedness() function to transform the co-occurence adjacency matrix to a similarity matrix. This function computes the relatedness (Hidalgo et al., 2007; Boschma et al., 2015; Balland, 2016) between entities (industries, technologies,…) from their co-occurrence (adjacency) matrix. Different normalization procedures are proposed following van Eck and Waltman (2009): association strength, cosine, Jaccard, and an adapted version of the association strength (probability index). I here choose the cosine similarity, since it is not so sensitive to scale.

mat_tech_rel <- mat_tech %>% 
  relatedness(method = "cosine")

Lets check what happened?

mat_tech_rel[1:10,1:10]
##              1          2           3           4           5          6
## 1  0.000000000 0.12444247 0.031647291 0.008611730 0.065189691 0.05833133
## 2  0.124442471 0.00000000 0.076368437 0.063892840 0.049192202 0.29610968
## 3  0.031647291 0.07636844 0.000000000 0.526742229 0.079944574 0.11770148
## 4  0.008611730 0.06389284 0.526742229 0.000000000 0.077112838 0.27846562
## 5  0.065189691 0.04919220 0.079944574 0.077112838 0.000000000 0.09536861
## 6  0.058331334 0.29610968 0.117701481 0.278465617 0.095368611 0.00000000
## 7  0.003189473 0.01466046 0.028286382 0.088155542 0.002226816 0.14493786
## 8  0.090412297 0.09976656 0.005267541 0.001794887 0.020478111 0.04103197
## 9  0.131096548 0.15521758 0.031316016 0.003808360 0.005667405 0.06382984
## 10 0.080553306 0.03780708 0.050191230 0.059658653 0.036916814 0.09019649
##               7           8            9         10
## 1  0.0031894727 0.090412297 0.1310965481 0.08055331
## 2  0.0146604648 0.099766557 0.1552175800 0.03780708
## 3  0.0282863822 0.005267541 0.0313160155 0.05019123
## 4  0.0881555416 0.001794887 0.0038083600 0.05965865
## 5  0.0022268162 0.020478111 0.0056674045 0.03691681
## 6  0.1449378553 0.041031973 0.0638298363 0.09019649
## 7  0.0000000000 0.000000000 0.0009658712 0.01474587
## 8  0.0000000000 0.000000000 0.1950549954 0.04095274
## 9  0.0009658712 0.195054995 0.0000000000 0.04878808
## 10 0.0147458733 0.040952740 0.0487880757 0.00000000

Now, at the row-column intercept, we will not have the number of co-occurences, but the relatedness index instead.

The Chinese Technology Space

With this matrix, we can now create the Chinese “Technology Space” a’la Hidalgo & Hausmann (2007). Here we create a network of technology fields. I will delete below average weights to create some sparsity, and calculate the eigenvector centrality. I will here use the Eigenvector centrality as an easy proxi for technological complexity.

library(tidygraph)

g_tech <- mat_tech_rel %>% as_tbl_graph(directed = FALSE) %N>%
  left_join(field_names %>% mutate(field_nr = field_nr %>% as.character()), by = c("name" = "field_nr")) %>%
  mutate(dgr = centrality_eigen(weights = weight)) %E>%
  filter(weight >= mean(weight))
g_tech 
## # A tbl_graph: 35 nodes and 177 edges
## #
## # An undirected simple graph with 1 component
## #
## # Edge Data: 177 x 3 (active)
##    from    to weight
##   <int> <int>  <dbl>
## 1     1     2 0.124 
## 2     1     3 0.0316
## 3     1     5 0.0652
## 4     1     6 0.0583
## 5     1     8 0.0904
## 6     1     9 0.131 
## # … with 171 more rows
## #
## # Node Data: 35 x 4
##   name  sector                 field_name                                dgr
##   <chr> <chr>                  <chr>                                   <dbl>
## 1 1     Electrical engineering Electrical machinery, apparatus, energy 0.731
## 2 2     Electrical engineering Audio-visual technology                 0.895
## 3 3     Electrical engineering Telecommunications                      0.825
## # … with 32 more rows

We will straight away visualize it as a network. We scale the nodes by their centrality, and color them by their higher level tech-sector.

In order to be able to reproduce the same node position in comparative graphs lateron, we will first save the node layout (classical fruchterman-reingold layout).

coords_tech <- g_tech %>% igraph::layout.fruchterman.reingold() %>% as_tibble()
colnames(coords_tech) <- c("x", "y")
library(ggraph)
g_tech %>%
  ggraph(layout =  coords_tech) + 
  geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") + 
  geom_node_point(aes(colour = sector, size = dgr)) + 
  geom_node_text(aes(label = field_name), size = 4, repel = TRUE) +
  theme_graph()

Lets see which tech-field is the most central (complex) one:

g_tech %N>% 
  arrange(desc(dgr)) %>%
  as_tibble() %>%
  slice(1:10)

And the least one…

g_tech %N>% 
  arrange(dgr) %>%
  as_tibble() %>%
  slice(1:10)

Optimally, one would have created this space on world-data, and not only Chinese patents, though. Let’s, now that we know the overall space, take a look at city level specialization and complexity.

Specialization pattern of cities

We here first join our city level patent data with the technology fields. Since a patent can have multiple ones, we this time fractionalize it with a field_weight.

reg_tech <- reg_pat %>%
  left_join(el_pat_tech %>% select(appln_id, techn_field_nr), by = "appln_id") %>%
  group_by(appln_id) %>%
  mutate(field_weight = 1 / n()) %>%
  ungroup()

Now we first aggregate it on the whole time frame. We could for sure also do that sepperate for different years. We filtr our cities with

reg_tech %<>%
  group_by(city, techn_field_nr) %>%
  summarise(n_tech_reg = sum(field_weight)) %>%
  ungroup() %>%
  drop_na() 

Now, we for the EconGeo package again have to transform it to a matrix. Again, we have to set the rownames, which are depreciated in the tibble

mat_reg_tech <- reg_tech %>%
  arrange(techn_field_nr, city) %>%
  pivot_wider(names_from = techn_field_nr, values_from = n_tech_reg, values_fill = list(n_tech_reg = 0))

rownames(mat_reg_tech) <- mat_reg_tech %>% pull(city)

mat_reg_tech %<>% select(-city) %>%
  as.matrix() %>%
  round()

Lets take a look:

mat_reg_tech[1:10, 1:10]
##             1    2    3    4   5    6   7    8    9  10
## baicheng    1    0    0    0   0    0   0    0    0   0
## baoding     1    0    0    0   1    0   0    1    2   3
## baoji       3    0    0    0   0    1   0    0    0   0
## baotou      1    0    0    0   0    1   0    0    0   0
## beijing   802 1452 1202 3114 230 4807 152 1519 1143 666
## cangzhou    0    0    0    0   0    0   0    0    0   0
## changchun   9    0    0    0   0    1   0   10    3   7
## changsha   10    1    1    2   0    9   0    1    2   8
## changshu    5    4    1    0   0    1   0    1    1   0
## changzhou  20    1    0    4   0    5   0    2    1   8

Now, we can use the location.quotient() function from EconGeo. This function computes location quotients from (incidence) regions - industries matrices. The numerator is the share of a given industry in a given region. The denominator is the share of a this industry in a larger economy (overall country for instance). This index is also referred to as the index of Revealed Comparative Advantage (RCA) following Ballasa (1965), or the Hoover-Balassa index. To make it simple, we create a binary one, only indicating if the city has a RCA in the associated field or not. Afterwards, we transform it back to a tibble, set rownames again in a column, and gather it from a long to wide format.

reg_RCA <- mat_reg_tech %>% location.quotient(binary = TRUE) %>% 
  as.data.frame() %>% 
  rownames_to_column("city") %>% 
  as_tibble() %>% 
  gather(key = "tech_field", value = "RCA", -city) %>%
  arrange(city, tech_field)
reg_RCA %>% head()

The Herfindahl() function computes the Herfindahl index from regions - industries matrices from (incidence) regions - industries matrices. It is a measure of concentration. This index is also known as the Herfindahl-Hirschman index (Herfindahl, 1959; Hirschman, 1945).

mat_reg_tech %>% 
  Herfindahl() %>% 
  as.data.frame() %>% 
  rownames_to_column("city") %>% 
  rename(HH = ".") %>% 
  arrange(desc(HH)) %>% 
  head(10)

Next, we can compute the Shannon entropy index (Shannon and Weaver, 1949; Frenken et al., 2007) from regions - industries matrices from (incidence) regions - industries matrices.

mat_reg_tech %>% 
  entropy() %>% 
  as.data.frame() %>% 
  rownames_to_column("city") %>% 
  rename(SE = ".") %>% 
  arrange(desc(SE)) %>%
  head(10)

Now, finally, we can inspect where in the technology space different cities are specialized. We select some of interest.

city_select <- c("beijing", "shanghai", "shenzhen")

In the following, I plot the technology-space and highlights in red the fields where the city has an RCA.

Beijing

i = 1
g_tech %N>%
  left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
  ggraph(layout = coords_tech) + 
  geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") + 
  geom_node_point(aes(colour = RCA, size = dgr)) + 
  geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
  scale_color_gradient(low = "skyblue", high = "red") +
  theme_graph() +
  ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))

Shanghai

i = 2
g_tech %N>%
  left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
  ggraph(layout = coords_tech) + 
  geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") + 
  geom_node_point(aes(colour = RCA, size = dgr)) + 
  geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
  scale_color_gradient(low = "skyblue", high = "red") +
  theme_graph() +
  ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))

Shenzhen

i = 3
g_tech %N>%
  left_join(reg_RCA %>% filter(city == city_select[i]) %>% select(-city), by = c("name" = "tech_field")) %>%
  ggraph(layout = coords_tech) + 
  geom_edge_link(aes(width = weight), alpha = 0.2, colour = "grey") + 
  geom_node_point(aes(colour = RCA, size = dgr)) + 
  geom_node_text(aes(label = field_name), size = 5, repel = TRUE) +
  scale_color_gradient(low = "skyblue", high = "red") +
  theme_graph() +
  ggtitle(paste("Technology Space: RCA", city_select[i], sep = " "))

So, what differences do we see?

Endnotes

References

Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/atlas-base/atlas/libblas.so.3.0
## LAPACK: /usr/lib/atlas-base/atlas/liblapack.so.3.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] Matrix_1.2-17   EconGeo_1.3     ggraph_2.0.2    tidygraph_1.1.2
##  [5] magrittr_1.5    forcats_0.5.0   stringr_1.4.0   dplyr_0.8.5    
##  [9] purrr_0.3.3     readr_1.3.1     tidyr_1.0.2     tibble_2.1.3   
## [13] ggplot2_3.3.0   tidyverse_1.3.0 knitr_1.28     
## 
## loaded via a namespace (and not attached):
##  [1] ggrepel_0.8.2      Rcpp_1.0.4         lubridate_1.7.4    lattice_0.20-38   
##  [5] utf8_1.1.4         assertthat_0.2.1   digest_0.6.25      ggforce_0.3.1     
##  [9] R6_2.4.1           cellranger_1.1.0   backports_1.1.5    reprex_0.3.0      
## [13] evaluate_0.14      httr_1.4.1         pillar_1.4.3       rlang_0.4.5       
## [17] readxl_1.3.1       rstudioapi_0.11    rmarkdown_2.1      labeling_0.3      
## [21] igraph_1.2.5       polyclip_1.10-0    munsell_0.5.0      broom_0.5.5       
## [25] compiler_3.6.0     modelr_0.1.6       xfun_0.12          pkgconfig_2.0.3   
## [29] htmltools_0.4.0    tidyselect_1.0.0   gridExtra_2.3      graphlayouts_0.6.0
## [33] viridisLite_0.3.0  fansi_0.4.1        crayon_1.3.4       dbplyr_1.4.2      
## [37] withr_2.1.2        MASS_7.3-51.4      grid_3.6.0         nlme_3.1-139      
## [41] jsonlite_1.6.1     gtable_0.3.0       lifecycle_0.2.0    DBI_1.1.0         
## [45] scales_1.1.0       cli_2.0.2          stringi_1.4.6      farver_2.0.3      
## [49] viridis_0.5.1      fs_1.3.2           xml2_1.2.5         ellipsis_0.3.0    
## [53] generics_0.0.2     vctrs_0.2.4        tools_3.6.0        glue_1.3.2        
## [57] tweenr_1.0.1       hms_0.5.3          yaml_2.2.1         colorspace_1.4-1  
## [61] rvest_0.3.5        haven_2.2.0