Explode data in a list not separated by comma

Question:

Let’s say I have the following database:

{'docdb_family_id': {0: 569328,
  1: 574660,
  2: 1187498,
  3: 1226468,
  4: 1236571,
  5: 1239098,
  6: 1239277,
  7: 1239483,
  8: 1239622,
  9: 1239624,
  10: 1239749,
  11: 1334477,
  12: 1340405,
  13: 1340418,
  14: 1340462,
  15: 1340471,
  16: 1340485,
  17: 1340488,
  18: 1340508,
  19: 1340519,
  20: 1340541},
 'newa_cited_docdb': {0: '[  596005  4321416  5802640  6031690  6043910  8600475  8642629  9203255  9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
  1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
  2: '[ 5855196  7755392 11183886 22894980 24648618 27185399]',
  3: '[ 3573464  6279285  6294985  6542463  6981930  7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
  4: '[ 2777078  2826073  5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
  5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
  6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
  7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
  8: '[ 3770167  9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
  9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
  10: '[ 6245732  6270984  6282047  6313094  6323632  6357314 12700997 14934415]',
  11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
  12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
  13: '[ 4096597  6452385  9164095 19820980 22468583 23758517 24922228]',
  14: '[ 6273193  6365448  9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
  15: '[ 3770297  6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
  16: '[21808322 25167492 25401922 26858065]',
  17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
  18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
  19: '[ 3029629  3461742  8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
  20: None},
 'paperid': {0: nan,
  1: nan,
  2: nan,
  3: nan,
  4: nan,
  5: nan,
  6: nan,
  7: nan,
  8: nan,
  9: nan,
  10: nan,
  11: nan,
  12: nan,
  13: nan,
  14: nan,
  15: nan,
  16: nan,
  17: nan,
  18: nan,
  19: nan,
  20: 1998988989.0},
 'fronteer': {0: 0,
  1: 0,
  2: 0,
  3: 0,
  4: 0,
  5: 0,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 0,
  13: 0,
  14: 0,
  15: 0,
  16: 0,
  17: 0,
  18: 0,
  19: 0,
  20: 1},
 'distance': {0: nan,
  1: nan,
  2: nan,
  3: nan,
  4: nan,
  5: nan,
  6: nan,
  7: nan,
  8: nan,
  9: nan,
  10: nan,
  11: nan,
  12: nan,
  13: nan,
  14: nan,
  15: nan,
  16: nan,
  17: nan,
  18: nan,
  19: nan,
  20: 0.0},
 'cited_docdb_ls': {0: '[  596005  4321416  5802640  6031690  6043910  8600475  8642629  9203255  9345445 10177065 10455451 13428248 22139349 22591458 24627241 24750476 26261826 26405611 27079105 27096884]',
  1: '[ 5956195 11260528 22181831 22437920 22642946 23278096 23407037 23458128 24244657 24355363 25014714 25115774 25156886 27047688 27089078 27398716]',
  2: '[ 5855196  7755392 11183886 22894980 24648618 27185399]',
  3: '[ 3573464  6279285  6294985  6542463  6981930  7427770 10325811 14970234 16878329 17935009 21811002 22329817 23543436 23907898 24456108 25283772]',
  4: '[ 2777078  2826073  5944733 10484188 11052747 14682645 15688752 22333410 22614097 22646501 22783765 22978728 23231683 24259740 24605606 24839432 25492752 27009992 27044704]',
  5: '[ 5777407 10417156 23463145 23845079 24397163 24426379 24916732 25216234 25296619 27054560 27509152]',
  6: '[ 4136523 12578497 21994155 22418792 22626616 22655464 22694825 22779403 23081767 23309829 23379411 23621952 24130698 24236071 24267003 24790872 24841797 25343500 27006578]',
  7: '[21722194 23841261 23870348 24749080 26713455 26884023 26892256 27123571]',
  8: '[ 3770167  9249538 20340153 21805004 21826650 23074051 23211424 23586695 23664858 24139881 24669345 24951262 25109266 25172355 25351735 26158421 27074633]',
  9: '[ 3773931 10400885 23825854 24863945 24904226 25372210 26673422 27108903]',
  10: '[ 6245732  6270984  6282047  6313094  6323632  6357314 12700997 14934415]',
  11: '[1331950 5937719 5950928 6032897 6737094 8103287]',
  12: '[22536768 23111794 23827356 24148953 24483064 24636228 26369896 26722884]',
  13: '[ 4096597  6452385  9164095 19820980 22468583 23758517 24922228]',
  14: '[ 6273193  6365448  9349940 10531948 13589721 20897840 21818345 22422049 23234586 23722349 24282964 24466601 25476838 26223504 26685774 26756449 26812104 26900843 27088150]',
  15: '[ 3770297  6285357 21272262 21883292 22392025 23100861 23160290 23827496 24060758 25448672 26918320]',
  16: '[21808322 25167492 25401922 26858065]',
  17: '[ 6293130 12621423 12977043 14043576 14524083 22013480 23070753 23360636 23672818 24210016 24396413 24505095 25447453 26335550 27560125]',
  18: '[21923978 23414619 23700077 23916998 23917011 23917023 24227869]',
  19: '[ 3029629  3461742  8589904 10338953 10633369 16254362 22248316 22635394 24392987 25416705 26671842 27391491 27406148]',
  20: []}}

What I would like to do is explode the column cited_docdb_ls, which contains lists whose elements are separated by spaces rather than commas.
How can I do that? If it is not possible, is there a way to separate the elements by commas rather than spaces and then explode them?
The resulting database should contain either cited_docdb_ls as traditional comma-separated lists or the exploded rows. I have checked the df.explode() documentation but could not find any hint on how to handle a list whose elements are separated by spaces.

Thank you

Asked By: Lusian

||

Answers:

Use Series.str.strip with Series.str.split for both columns and then DataFrame.explode:

# Drop the surrounding brackets and split on whitespace so each cell becomes
# a real list, then explode both list columns together.
list_columns = ['newa_cited_docdb', 'cited_docdb_ls']
tokenized = {name: df[name].str.strip('[]').str.split() for name in list_columns}
df = df.assign(**tokenized).explode(list_columns)
               
Answered By: jezrael

I would use str.findall with a \d+ regex (one or more digits) to convert the strings to lists, then explode:

# Collect every run of digits in each bracketed string into a list, then
# explode both list columns together.
# NOTE: the pattern has to be r'\d+' (the digit class). A bare 'd+' matches the
# literal letter "d" and would return empty lists for these numeric strings.
out = (df.assign(newa_cited_docdb=df['newa_cited_docdb'].str.findall(r'\d+'),
                 cited_docdb_ls=df['cited_docdb_ls'].str.findall(r'\d+'))
         .explode(['newa_cited_docdb', 'cited_docdb_ls'])
     )

NB. If the values are not only digits, a (\w+) regex will be more generic. However, if the strings also contain [ or ] anywhere other than as the first and last character (e.g. [abc 12]3 45d]), then @jezrael’s answer is the alternative to use.

output:

    docdb_family_id newa_cited_docdb       paperid  fronteer  distance  
0            569328           596005           NaN         0       NaN   
0            569328          4321416           NaN         0       NaN   
0            569328          5802640           NaN         0       NaN   
0            569328          6031690           NaN         0       NaN   
0            569328          6043910           NaN         0       NaN   
..              ...              ...           ...       ...       ...   
19          1340519         25416705           NaN         0       NaN   
19          1340519         26671842           NaN         0       NaN   
19          1340519         27391491           NaN         0       NaN   
19          1340519         27406148           NaN         0       NaN   
20          1340541             None  1.998989e+09         1       0.0   

   cited_docdb_ls  
0          596005  
0         4321416  
0         5802640  
0         6031690  
0         6043910  
..            ...  
19       25416705  
19       26671842  
19       27391491  
19       27406148  
20            NaN  

[239 rows x 6 columns]
Answered By: mozway
Categories: questions Tags: , , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.