Extract JSON data with Python
- 1 Minute to read
- Print
- DarkLight
- PDF
Extract JSON data with Python
- 1 Minute to read
- Print
- DarkLight
- PDF
Article summary
Did you find this summary helpful?
Thank you for your feedback
Overview
We have a database with a wide continuous string containing diverse information.
Some of this data is in the form of small JSON strings, we want to extract certain keys from this to create new fields.
Actions
- Make a DataSet field of info records that start with a { character
- Use this as a filter in an Expression that is simply a copy of the info field.
- This reduces the size of the data to discrete - and so we can use the programmed field functions
- Using Programmed Field, execute Python code to extract each key and create a new field.
Example Python Code
import sys
import json
#input and output files from first 2 arguments
inputfile = sys.argv[1]
outputfile = sys.argv[2]
countsfile = sys.argv[3]
#read input file into array
iFile = open(inputfile, 'r')
inputArray = iFile.read().splitlines()
iFile.close()
oFile = open(outputfile, 'w') #write to file
for line in inputArray:
#
try:
#process the lines here
if line.startswith("{") :
#json string in this example has ' replacing "
line = line.replace("'","\"")
parsed_json = json.loads(line)
##extracting the key messageId
processed=str(parsed_json['messageId'])
else:
processed=""
except:
processed="" #assign to null on exception
#endif
#
oFile.write(processed+'\n')
oFile.close()
Video
Example Script
[
{
"method": "SaveDataSetAsField",
"name": "Has Json",
"dataSet": {
"logic": "or",
"name": "HasJson",
"strict": true,
"set": [
{
"logic": "and",
"stype": "FIELD",
"entity": {
"type": "field",
"name": "CompanyAndInfo.info",
"table": "CompanyAndInfo",
"valueFilter": [
]
},
"op": "like",
"values": [
"{*"
]
}
]
},
"project": "ExtractJSON"
},
{
"method": "BuildBakedField",
"name": "Info With JSON",
"project": "ExtractJSON",
"targetTable": "CompanyAndInfo",
"overwrite": true,
"dataType": "default",
"function": "dmath",
"p1": "expression",
"p2": "[info]",
"nullIsZero": false,
"NOORD": false,
"dataSet": {
"logic": "or",
"name": "Has Json 1",
"strict": true,
"set": [
{
"logic": "or",
"stype": "FIELD",
"entity": {
"type": "field",
"name": "CompanyAndInfo.Has Json",
"table": "CompanyAndInfo",
"valueFilter": [
]
},
"op": "=",
"values": [
"1"
]
}
]
}
},
{
"method": "BuildBakedField",
"targetTable": "CompanyAndInfo",
"overwrite": true,
"name": "extracted-message",
"function": "PyDiscreteProc",
"source": "Info With JSON",
"code": "696D706F7274207379730A696D706F7274206A736F6E0A0A23696E70757420616E64206F75747075742066696C65732066726F6D206669727374203220617267756D656E747320200A696E70757466696C65203D207379732E617267765B315D0A6F757470757466696C65203D207379732E617267765B325D0A636F756E747366696C65203D207379732E617267765B335D0A0A237265616420696E7075742066696C6520696E746F2061727261790A6946696C65203D206F70656E28696E70757466696C652C2027722729200A696E7075744172726179203D206946696C652E7265616428292E73706C69746C696E65732829200A6946696C652E636C6F736528290A0A202020200A0A6F46696C65203D206F70656E286F757470757466696C652C20277727292023777269746520746F2066696C650A666F72206C696E6520696E20696E70757441727261793A0A230A202020207472793A0A20202020202020202370726F6365737320746865206C696E657320686572650A20202020202020206966206C696E652E7374617274737769746828227B2229203A0A202020202020202020202020236A736F6E20737472696E6720696E2074686973206578616D706C65206861732027207265706C6163696E6720220A2020202020202020202020206C696E65203D206C696E652E7265706C616365282227222C225C2222290A2020202020202020202020207061727365645F6A736F6E203D206A736F6E2E6C6F616473286C696E65290A202020202020202020202020232365787472616374696E6720746865206B6579206D65737361676549640A20202020202020202020202070726F6365737365643D737472287061727365645F6A736F6E5B276D6573736167654964275D290A2020202020202020656C73653A0A20202020202020202020202070726F6365737365643D22220A20202020202020200A20202020202020200A202020206578636570743A0A202020202020202070726F6365737365643D2222202361737369676E20746F206E756C6C206F6E20657863657074696F6E0A23656E646966202020200A230A202020206F46696C652E77726974652870726F6365737365642B275C6E2729200A6F46696C652E636C6F736528290A2020202020202020202020202020202020202020200A",
"project": "ExtractJSON"
},
{
"method": "BuildBakedField",
"targetTable": "CompanyAndInfo",
"overwrite": true,
"name": "extracted-date",
"function": "PyDiscreteProc",
"source": "Info With JSON",
"code": "696D706F7274207379730A696D706F7274206A736F6E0A0A23696E70757420616E64206F75747075742066696C65732066726F6D206669727374203220617267756D656E747320200A696E70757466696C65203D207379732E617267765B315D0A6F757470757466696C65203D207379732E617267765B325D0A636F756E747366696C65203D207379732E617267765B335D0A0A237265616420696E7075742066696C6520696E746F2061727261790A6946696C65203D206F70656E28696E70757466696C652C2027722729200A696E7075744172726179203D206946696C652E7265616428292E73706C69746C696E65732829200A6946696C652E636C6F736528290A0A202020200A0A6F46696C65203D206F70656E286F757470757466696C652C20277727292023777269746520746F2066696C650A666F72206C696E6520696E20696E70757441727261793A0A230A202020207472793A0A20202020202020202370726F6365737320746865206C696E657320686572650A20202020202020206966206C696E652E7374617274737769746828227B2229203A0A202020202020202020202020236A736F6E20737472696E6720696E2074686973206578616D706C65206861732027207265706C6163696E6720220A2020202020202020202020206C696E65203D206C696E652E7265706C616365282227222C225C2222290A2020202020202020202020207061727365645F6A736F6E203D206A736F6E2E6C6F616473286C696E65290A202020202020202020202020232365787472616374696E6720746865206B6579206D65737361676549640A20202020202020202020202070726F6365737365643D737472287061727365645F6A736F6E5B2764617465275D290A2020202020202020656C73653A0A20202020202020202020202070726F6365737365643D22220A20202020202020200A20202020202020200A202020206578636570743A0A202020202020202070726F6365737365643D2222202361737369676E20746F206E756C6C206F6E20657863657074696F6E0A23656E646966202020200A230A202020206F46696C652E77726974652870726F6365737365642B275C6E2729200A6F46696C652E636C6F736528290A2020202020202020202020202020202020202020200A",
"project": "ExtractJSON"
},
{
"method": "BuildBakedField",
"targetTable": "CompanyAndInfo",
"overwrite": true,
"name": "extracted-sale",
"function": "PyDiscreteProc",
"source": "Info With JSON",
"code": "696D706F7274207379730A696D706F7274206A736F6E0A0A23696E70757420616E64206F75747075742066696C65732066726F6D206669727374203220617267756D656E747320200A696E70757466696C65203D207379732E617267765B315D0A6F757470757466696C65203D207379732E617267765B325D0A636F756E747366696C65203D207379732E617267765B335D0A0A237265616420696E7075742066696C6520696E746F2061727261790A6946696C65203D206F70656E28696E70757466696C652C2027722729200A696E7075744172726179203D206946696C652E7265616428292E73706C69746C696E65732829200A6946696C652E636C6F736528290A0A202020200A0A6F46696C65203D206F70656E286F757470757466696C652C20277727292023777269746520746F2066696C650A666F72206C696E6520696E20696E70757441727261793A0A230A202020207472793A0A20202020202020202370726F6365737320746865206C696E657320686572650A20202020202020206966206C696E652E7374617274737769746828227B2229203A0A202020202020202020202020236A736F6E20737472696E6720696E2074686973206578616D706C65206861732027207265706C6163696E6720220A2020202020202020202020206C696E65203D206C696E652E7265706C616365282227222C225C2222290A2020202020202020202020207061727365645F6A736F6E203D206A736F6E2E6C6F616473286C696E65290A202020202020202020202020232365787472616374696E6720746865206B6579206D65737361676549640A20202020202020202020202070726F6365737365643D737472287061727365645F6A736F6E5B2773616C65275D290A2020202020202020656C73653A0A20202020202020202020202070726F6365737365643D22220A20202020202020200A20202020202020200A202020206578636570743A0A202020202020202070726F6365737365643D2222202361737369676E20746F206E756C6C206F6E20657863657074696F6E0A23656E646966202020200A230A202020206F46696C652E77726974652870726F6365737365642B275C6E2729200A6F46696C652E636C6F736528290A2020202020202020202020202020202020202020200A",
"project": "ExtractJSON"
}
]
Was this article helpful?