Hi,
I haven't been able to stream pig data to a command line script. Can someone
help out?
I want to execute a command line script called GMTFilter (all stdin, stdout,
and stderr work) from a pig script. I tried the following approaches, and
none worked. The 2nd and 3rd methods work if they're called directly from
the shell, but fail if run in a pig script with stream. Please see the attached
error at the end. I only attached the error log for the third approach since
I think it has the highest chance of being right.
Thanks in advance!
Alex
1) Directly in pig script
stream data through `command -options`;
2) With bash
#!/usr/bin/env bash
command -options
And in pig
define filter `GMTFilter.sh`
input (stdin using PigStreaming(','))
ouptut(stdout using PigStreaming(','))
ship('GMTFilter.sh');
stream data through filter;
3) With Python
import sys
import subprocess
def main():
for line in sys.stdin:
proc = subprocess.Popen("command
-options",shell=True,stdin=subprocess.PIPE, stdout=subprocess.PIPE)
output, error = proc.communicate('%s,%s\n'%(lon,lat))
output = ouput.strip()
output = output.split(',')
sys.stdout.write(lon+'\t'+lat+'\n')
if __name__=='__main__':
main()
And in pig
define filter `GMTFilter.py`
input (stdin using PigStreaming(','))
ouptut(stdout using PigStreaming(','))
ship('GMTFilter.py');
stream data through filter;
#####################################################################################################################
MapAttempt TASK_TYPE="MAP" TASKID="task_201009211320_0201_m_000076"
TASK_ATTEMPT_ID="attempt_201009211320_0201_m_000076_0" TASK_STATUS="KILLED"
FINISH_TIME="1287601082247" HOSTNAME="n0003" ERROR="" .
ReduceAttempt TASK_TYPE="REDUCE" TASKID="task_201009211320_0201_r_000000"
TASK_ATTEMPT_ID="attempt_201009211320_0201_r_000000_0"
START_TIME="1287600127497"
TRACKER_NAME="tracker_n0004:localhost\.localdomain/127\.0\.0\.1:51031"
HTTP_PORT="50060" .
ReduceAttempt TASK_TYPE="REDUCE" TASKID="task_201009211320_0201_r_000000"
TASK_ATTEMPT_ID="attempt_201009211320_0201_r_000000_0" TASK_STATUS="FAILED"
FINISH_TIME="1287601092279" HOSTNAME="n0004"
ERROR="org\.apache\.pig\.backend\.executionengine\.ExecException: ERROR
2090: Received Error while processing the reduce plan: 'GMTFilter\.py '
failed with exit status: 1
at
org\.apache\.pig\.backend\.hadoop\.executionengine\.mapReduceLayer\.PigMapReduce$Reduce\.runPipeline(PigMapReduce\.java:467)
at
org\.apache\.pig\.backend\.hadoop\.executionengine\.mapReduceLayer\.PigMapReduce$Reduce\.cleanup(PigMapReduce\.java:493)
at org\.apache\.hadoop\.mapreduce\.Reducer\.run(Reducer\.java:178)
at
org\.apache\.hadoop\.mapred\.ReduceTask\.runNewReducer(ReduceTask\.java:566)
at
org\.apache\.hadoop\.mapred\.ReduceTask\.run(ReduceTask\.java:408)
at org\.apache\.hadoop\.mapred\.Child\.main(Child\.java:170)
" .
ReduceAttempt TASK_TYPE="REDUCE" TASKID="task_201009211320_0201_r_000000"
TASK_ATTEMPT_ID="attempt_201009211320_0201_r_000000_1"
START_TIME="1287601095958"
TRACKER_NAME="tracker_n0003:localhost\.localdomain/127\.0\.0\.1:37197"
HTTP_PORT="50060" .
ReduceAttempt TASK_TYPE="REDUCE" TASKID="task_201009211320_0201_r_000000"
TASK_ATTEMPT_ID="attempt_201009211320_0201_r_000000_1" TASK_STATUS="FAILED"
FINISH_TIME="1287601107494" HOSTNAME="n0003"
ERROR="org\.apache\.pig\.backend\.executionengine\.ExecException: ERROR
2090: Received Error while processing the reduce plan: 'GMTFilter\.py '
failed with exit status: 1
at
org\.apache\.pig\.backend\.hadoop\.executionengine\.mapReduceLayer\.PigMapReduce$Reduce\.runPipeline(PigMapReduce\.java:467)
at
org\.apache\.pig\.backend\.hadoop\.executionengine\.mapReduceLayer\.PigMapReduce$Reduce\.cleanup(PigMapReduce\.java:493)
at org\.apache\.hadoop\.mapreduce\.Reducer\.run(Reducer\.java:178)
at
org\.apache\.hadoop\.mapred\.ReduceTask\.runNewReducer(ReduceTask\.java:566)
at
org\.apache\.hadoop\.mapred\.ReduceTask\.run(ReduceTask\.java:408)
at org\.apache\.hadoop\.mapred\.Child\.main(Child\.java:170)
" .
ReduceAttempt TASK_TYPE="REDUCE" TASKID="task_201009211320_0201_r_000000"
TASK_ATTEMPT_ID="attempt_201009211320_0201_r_000000_2"
START_TIME="1287601107970"
TRACKER_NAME="tracker_n0003:localhost\.localdomain/127\.0\.0\.1:37197"
HTTP_PORT="50060" .
ReduceAttempt TASK_TYPE="REDUCE" TASKID="task_201009211320_0201_r_000000"
TASK_ATTEMPT_ID="attempt_201009211320_0201_r_000000_2" TASK_STATUS="FAILED"
FINISH_TIME="1287601119092" HOSTNAME="n0003"
ERROR="org\.apache\.pig\.backend\.executionengine\.ExecException: ERROR
2090: Received Error while processing the reduce plan: 'GMTFilter\.py '
failed with exit status: 1
at
org\.apache\.pig\.backend\.hadoop\.executionengine\.mapReduceLayer\.PigMapReduce$Reduce\.runPipeline(PigMapReduce\.java:467)
at
org\.apache\.pig\.backend\.hadoop\.executionengine\.mapReduceLayer\.PigMapReduce$Reduce\.cleanup(PigMapReduce\.java:493)
at org\.apache\.hadoop\.mapreduce\.Reducer\.run(Reducer\.java:178)
at
org\.apache\.hadoop\.mapred\.ReduceTask\.runNewReducer(ReduceTask\.java:566)
at
org\.apache\.hadoop\.mapred\.ReduceTask\.run(ReduceTask\.java:408)
at org\.apache\.hadoop\.mapred\.Child\.main(Child\.java:170)
" .