Extract JSON data with Python
  • 1 Minute to read
  • Dark
    Light
  • PDF

Extract JSON data with Python

  • Dark
    Light
  • PDF

Article summary

Overview

We have a database with a wide continuous string containing diverse information.

Some of this data is in the form of small JSON strings, we want to extract certain keys from this to create new fields.

Actions

  • Make a DataSet field of info records that start with a { character
  • Use this as a filter in an Expression that is simply a copy of the info field. 
  • This reduces the size of the data to discrete - and so we can use the programmed field functions
  • Using Programmed Field, execute Python code to extract each key and create a new field.

Example Python Code

import sys
import json

#input and output files from first 2 arguments  
inputfile = sys.argv[1]
outputfile = sys.argv[2]
countsfile = sys.argv[3]

#read input file into array
iFile = open(inputfile, 'r') 
inputArray = iFile.read().splitlines() 
iFile.close()

    

oFile = open(outputfile, 'w') #write to file
for line in inputArray:
#
    try:
        #process the lines here
        if line.startswith("{") :
            #json string in this example has ' replacing "
            line = line.replace("'","\"")
            parsed_json = json.loads(line)
            ##extracting the key messageId
            processed=str(parsed_json['messageId'])
        else:
            processed=""
        
        
    except:
        processed="" #assign to null on exception
#endif    
#
    oFile.write(processed+'\n') 
oFile.close()
                     


Video


Example Script

[
  {
    "method": "SaveDataSetAsField",
    "name": "Has Json",
    "dataSet": {
      "logic": "or",
      "name": "HasJson",
      "strict": true,
      "set": [
        {
          "logic": "and",
          "stype": "FIELD",
          "entity": {
            "type": "field",
            "name": "CompanyAndInfo.info",
            "table": "CompanyAndInfo",
            "valueFilter": [
            ]
          },
          "op": "like",
          "values": [
            "{*"
          ]
        }
      ]
    },
    "project": "ExtractJSON"
  },
  {
    "method": "BuildBakedField",
    "name": "Info With JSON",
    "project": "ExtractJSON",
    "targetTable": "CompanyAndInfo",
    "overwrite": true,
    "dataType": "default",
    "function": "dmath",
    "p1": "expression",
    "p2": "[info]",
    "nullIsZero": false,
    "NOORD": false,
    "dataSet": {
      "logic": "or",
      "name": "Has Json 1",
      "strict": true,
      "set": [
        {
          "logic": "or",
          "stype": "FIELD",
          "entity": {
            "type": "field",
            "name": "CompanyAndInfo.Has Json",
            "table": "CompanyAndInfo",
            "valueFilter": [
            ]
          },
          "op": "=",
          "values": [
            "1"
          ]
        }
      ]
    }
  },
  {
    "method": "BuildBakedField",
    "targetTable": "CompanyAndInfo",
    "overwrite": true,
    "name": "extracted-message",
    "function": "PyDiscreteProc",
    "source": "Info With JSON",
    "code": "696D706F7274207379730A696D706F7274206A736F6E0A0A23696E70757420616E64206F75747075742066696C65732066726F6D206669727374203220617267756D656E747320200A696E70757466696C65203D207379732E617267765B315D0A6F757470757466696C65203D207379732E617267765B325D0A636F756E747366696C65203D207379732E617267765B335D0A0A237265616420696E7075742066696C6520696E746F2061727261790A6946696C65203D206F70656E28696E70757466696C652C2027722729200A696E7075744172726179203D206946696C652E7265616428292E73706C69746C696E65732829200A6946696C652E636C6F736528290A0A202020200A0A6F46696C65203D206F70656E286F757470757466696C652C20277727292023777269746520746F2066696C650A666F72206C696E6520696E20696E70757441727261793A0A230A202020207472793A0A20202020202020202370726F6365737320746865206C696E657320686572650A20202020202020206966206C696E652E7374617274737769746828227B2229203A0A202020202020202020202020236A736F6E20737472696E6720696E2074686973206578616D706C65206861732027207265706C6163696E6720220A2020202020202020202020206C696E65203D206C696E652E7265706C616365282227222C225C2222290A2020202020202020202020207061727365645F6A736F6E203D206A736F6E2E6C6F616473286C696E65290A202020202020202020202020232365787472616374696E6720746865206B6579206D65737361676549640A20202020202020202020202070726F6365737365643D737472287061727365645F6A736F6E5B276D6573736167654964275D290A2020202020202020656C73653A0A20202020202020202020202070726F6365737365643D22220A20202020202020200A20202020202020200A202020206578636570743A0A202020202020202070726F6365737365643D2222202361737369676E20746F206E756C6C206F6E20657863657074696F6E0A23656E646966202020200A230A202020206F46696C652E77726974652870726F6365737365642B275C6E2729200A6F46696C652E636C6F736528290A2020202020202020202020202020202020202020200A",
    "project": "ExtractJSON"
  },
  {
    "method": "BuildBakedField",
    "targetTable": "CompanyAndInfo",
    "overwrite": true,
    "name": "extracted-date",
    "function": "PyDiscreteProc",
    "source": "Info With JSON",
    "code": "696D706F7274207379730A696D706F7274206A736F6E0A0A23696E70757420616E64206F75747075742066696C65732066726F6D206669727374203220617267756D656E747320200A696E70757466696C65203D207379732E617267765B315D0A6F757470757466696C65203D207379732E617267765B325D0A636F756E747366696C65203D207379732E617267765B335D0A0A237265616420696E7075742066696C6520696E746F2061727261790A6946696C65203D206F70656E28696E70757466696C652C2027722729200A696E7075744172726179203D206946696C652E7265616428292E73706C69746C696E65732829200A6946696C652E636C6F736528290A0A202020200A0A6F46696C65203D206F70656E286F757470757466696C652C20277727292023777269746520746F2066696C650A666F72206C696E6520696E20696E70757441727261793A0A230A202020207472793A0A20202020202020202370726F6365737320746865206C696E657320686572650A20202020202020206966206C696E652E7374617274737769746828227B2229203A0A202020202020202020202020236A736F6E20737472696E6720696E2074686973206578616D706C65206861732027207265706C6163696E6720220A2020202020202020202020206C696E65203D206C696E652E7265706C616365282227222C225C2222290A2020202020202020202020207061727365645F6A736F6E203D206A736F6E2E6C6F616473286C696E65290A202020202020202020202020232365787472616374696E6720746865206B6579206D65737361676549640A20202020202020202020202070726F6365737365643D737472287061727365645F6A736F6E5B2764617465275D290A2020202020202020656C73653A0A20202020202020202020202070726F6365737365643D22220A20202020202020200A20202020202020200A202020206578636570743A0A202020202020202070726F6365737365643D2222202361737369676E20746F206E756C6C206F6E20657863657074696F6E0A23656E646966202020200A230A202020206F46696C652E77726974652870726F6365737365642B275C6E2729200A6F46696C652E636C6F736528290A2020202020202020202020202020202020202020200A",
    "project": "ExtractJSON"
  },
  {
    "method": "BuildBakedField",
    "targetTable": "CompanyAndInfo",
    "overwrite": true,
    "name": "extracted-sale",
    "function": "PyDiscreteProc",
    "source": "Info With JSON",
    "code": "696D706F7274207379730A696D706F7274206A736F6E0A0A23696E70757420616E64206F75747075742066696C65732066726F6D206669727374203220617267756D656E747320200A696E70757466696C65203D207379732E617267765B315D0A6F757470757466696C65203D207379732E617267765B325D0A636F756E747366696C65203D207379732E617267765B335D0A0A237265616420696E7075742066696C6520696E746F2061727261790A6946696C65203D206F70656E28696E70757466696C652C2027722729200A696E7075744172726179203D206946696C652E7265616428292E73706C69746C696E65732829200A6946696C652E636C6F736528290A0A202020200A0A6F46696C65203D206F70656E286F757470757466696C652C20277727292023777269746520746F2066696C650A666F72206C696E6520696E20696E70757441727261793A0A230A202020207472793A0A20202020202020202370726F6365737320746865206C696E657320686572650A20202020202020206966206C696E652E7374617274737769746828227B2229203A0A202020202020202020202020236A736F6E20737472696E6720696E2074686973206578616D706C65206861732027207265706C6163696E6720220A2020202020202020202020206C696E65203D206C696E652E7265706C616365282227222C225C2222290A2020202020202020202020207061727365645F6A736F6E203D206A736F6E2E6C6F616473286C696E65290A202020202020202020202020232365787472616374696E6720746865206B6579206D65737361676549640A20202020202020202020202070726F6365737365643D737472287061727365645F6A736F6E5B2773616C65275D290A2020202020202020656C73653A0A20202020202020202020202070726F6365737365643D22220A20202020202020200A20202020202020200A202020206578636570743A0A202020202020202070726F6365737365643D2222202361737369676E20746F206E756C6C206F6E20657863657074696F6E0A23656E646966202020200A230A202020206F46696C652E77726974652870726F6365737365642B275C6E2729200A6F46696C652E636C6F736528290A2020202020202020202020202020202020202020200A",
    "project": "ExtractJSON"
  }
]



Was this article helpful?

What's Next