bufr.pm:bufrextract.pl_source

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Next revision
Previous revision
bufr.pm:bufrextract.pl_source [2016-11-29 08:41:07]
pals created
bufr.pm:bufrextract.pl_source [2026-03-26 17:56:54] (current)
pals
Line 1: Line 1:
 <code perl> <code perl>
-#!/usr/bin/perl -w+#!/usr/bin/perl
  
-# (C) Copyright 2010-2016 MET Norway+Copyright (C) 2010-2026 MET Norway
 # #
 # This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
Line 22: Line 22:
  
 use strict; use strict;
 +use warnings;
 use Getopt::Long; use Getopt::Long;
 use Pod::Usage qw(pod2usage); use Pod::Usage qw(pod2usage);
Line 29: Line 30:
 my %option = (); my %option = ();
 GetOptions( GetOptions(
-           \%option, +    \%option, 
-           'ahl=s',        # Extract BUFR messages with AHL matching <ahl_regexp> only +    'ahl=s',           # Extract BUFR messages with AHL matching <ahl_regexp> only 
-           'only_ahl',     Extract AHLs only +    'gts',             Include full gts message envelope if present 
-           'reuse_ahl=i',  Reuse last AHL if current BUFR message has no AHL +    'filter=s',        Extract BUFR messages meeting the <metadata criteria> only 
-           'without_ahl',  # Print the BUFR messages only, skipping AHLs +    'help',            # Print help information and exit 
-           'help',         # Print help information and exit +    'only_ahl',        # Extract AHLs only 
-           'outfile=s',    # Print to file instead of STDOUT +    'outfile=s',       # Print to file instead of STDOUT 
-           'verbose=i',    # Set verbose level to n, 0<=n<=6 (default 0) +    'verbose=i',       # Set verbose level to n, 0<=n<=6 (default 0) 
-       ) or pod2usage(-verbose => 0);+    'without_ahl',     # Print the BUFR messages only, skipping AHLs 
 +    ) or pod2usage(-verbose => 0);
  
 # User asked for help # User asked for help
Line 43: Line 45:
  
 # only_ahl and without_ahl are mutually exclusive # only_ahl and without_ahl are mutually exclusive
-pod2usage( -message => "Options only_ahl and without_ahl are mutually exclusive",+pod2usage( -message => "Options only_ahlwithout_ahl and gts are mutually exclusive",
            -exitval => 2,            -exitval => 2,
            -verbose => 0)            -verbose => 0)
-    if $option{only_ahl} && $option{without_ahl};+    if ( ($option{only_ahl} && ($option{without_ahl} || $option{gts})) 
 +         || ($option{without_ahl} && ($option{only_ahl} || $option{gts})) 
 +         || ($option{gts} && ($option{only_ahl} || $option{without_ahl})) );
  
 # Make sure there is at least one input file # Make sure there is at least one input file
Line 54: Line 58:
 Geo::BUFR->set_verbose($option{verbose}) if $option{verbose}; Geo::BUFR->set_verbose($option{verbose}) if $option{verbose};
  
-Set whether last ahl should be reused if current BUFR message has no AHL +For filtering on ahl
-Geo::BUFR->reuse_current_ahl($option{reuse_ahl}) if defined $option{reuse_ahl}; +
 my $ahl_regexp; my $ahl_regexp;
 if ($option{ahl}) { if ($option{ahl}) {
Line 62: Line 64:
     die "Argument to --ahl is not a valid Perl regular expression: $@" if $@;     die "Argument to --ahl is not a valid Perl regular expression: $@" if $@;
 } }
 +
 +# For filtering on metadata in section 0/1
 +my $filter = $option{filter} ? $option{filter} : '';
 +my $or_criteria_ref = get_filter_criteria($filter);
  
 # Where to direct output (including verbose output, but not output to STDERR) # Where to direct output (including verbose output, but not output to STDERR)
Line 71: Line 77:
     $OUT = *STDOUT;     $OUT = *STDOUT;
 } }
 +binmode($OUT);
  
 # No need to decode section 4 here # No need to decode section 4 here
Line 78: Line 85:
 foreach my $inputfname ( @ARGV ) { foreach my $inputfname ( @ARGV ) {
     my $bufr = Geo::BUFR->new();     my $bufr = Geo::BUFR->new();
-    $bufr->set_filter_cb(\&filter_on_ahl,$ahl_regexp) if $option{ahl};+ 
 +    # Could alternatively have merged filtering on ahl and metadata into 
 +    # one single callback function, but that would be a rather complex 
 +    # one, so we prefer to do the filtering on metadata later 
 +    $bufr->set_filter_cb(\&filter_on_ahl, $ahl_regexp) if $option{ahl};
  
     # Open BUFR file     # Open BUFR file
Line 122: Line 133:
         }         }
  
-        next if $option{ahl} && $bufr->is_filtered();+ # Filtering on ahl 
 +        next READLOOP if $option{ahl} && $bufr->is_filtered(); 
 + 
 + # Filtering on metadata 
 +        next READLOOP if $or_criteria_ref && not or_filter($bufr, $or_criteria_ref); 
         # Skip messages where stated length of BUFR message is sure to         # Skip messages where stated length of BUFR message is sure to
         # be erroneous, unless we want ahls only (or should we skip         # be erroneous, unless we want ahls only (or should we skip
         # message in this case also? Hard choice...)         # message in this case also? Hard choice...)
-        next if !$option{only_ahl} && $bufr->bad_bufrlength();+        next READLOOP if !$option{only_ahl} && $bufr->bad_bufrlength();
  
         my $current_subset_number = $bufr->get_current_subset_number();         my $current_subset_number = $bufr->get_current_subset_number();
Line 133: Line 149:
         last READLOOP if $current_subset_number == 0;         last READLOOP if $current_subset_number == 0;
  
- $current_message_number = $bufr->get_current_message_number(); +        $current_message_number = $bufr->get_current_message_number(); 
- $current_ahl = $bufr->get_current_ahl() || '';+        $current_ahl = $bufr->get_current_ahl() || ''; 
 +        my $gts_eom = '';
  
- # Must use \r\r\n after AHL, or else BUFR.pm will not +        if ($current_ahl) { 
- # recognize AHL. Should I change this in BUFR.pm? +            if ($option{only_ahl}) { 
- if ($current_ahl && !$bufr->ahl_is_reused()) { +                print $OUT $current_ahl, "\n"; 
-     if ($option{only_ahl}) { +            } elsif (!$option{without_ahl}) { 
- print $OUT $current_ahl, "\n"; +                if ($option{gts}) { 
-     } elsif (!$option{without_ahl}) { +                    my $current_gts_starting_line = $bufr->get_current_gts_starting_line() || ''; 
- print $OUT $current_ahl . "\r\r\n"; +                    print $OUT $current_gts_starting_line; 
-     +                    $gts_eom = $bufr->get_current_gts_eom() || ''; 
-+                } 
- next READLOOP if $option{only_ahl};+                # Use \r\r\n after AHL, since this is the standard 
 +                # sequence used in GTS bulletins 
 +                print $OUT $current_ahl . "\r\r\n"; 
 +            
 +        
 +        next READLOOP if $option{only_ahl};
  
- my $msg = $bufr->get_bufr_message(); +        my $msg = $bufr->get_bufr_message(); 
- print $OUT $msg; +        print $OUT $msg, $gts_eom
-    }+  }
 } }
  
Line 160: Line 182:
     return $ahl =~ $ahl_regexp ? 0 : 1;     return $ahl =~ $ahl_regexp ? 0 : 1;
 } }
 +
 +# Get the list of alternative metadata criteria (these are separated
 +# by '|', see pod)
 +sub get_filter_criteria {
 +    my $filter = shift;
 +    return ('') if ! $filter;
 +
 +    my @or_criteria;
 +    my @criteria = split /[|]/, $filter;
 +    foreach my $cr (@criteria) {
 +        $cr =~ s/^\s+//;
 +        $cr =~ s/\s+$//;
 +        if ($cr ne '') {
 +            push @or_criteria, $cr;
 +        }
 +    }
 +    return \@or_criteria;
 +}
 +
 +# Return true (1) if the BUFR message is matching all @and_criteria
 +# (to be extracted) for at least one of the @or_criteria
 +sub or_filter {
 +    my ($bufr, $or_criteria_ref) = @_;
 +
 +    my $be = $bufr->get_bufr_edition() || return 0;
 +    my $dc = $bufr->get_data_category();
 +    # Choose to equate data_subcategory with int_data_subcategory, but
 +    # not quite sure about this
 +    my $ic = ($be == 4) ? $bufr->get_int_data_subcategory()
 +                        : $bufr->get_data_subcategory();
 +    my $lc = $bufr->get_loc_data_subcategory();
 +    my $oc = $bufr->get_centre();
 +    my $os = $bufr->get_subcentre();
 +    my $mt = $bufr->get_master_table_version();
 +    my $lt = $bufr->get_local_table_version();
 +    # This will not work for edition 3 when year is before 2000,
 +    # but hard to find a better way...
 +    my $ye = ($be == 4) ? $bufr->get_year()
 +                        : $bufr->get_year_of_century + 2000;
 +    my $mo = $bufr->get_month();
 +    my $da = $bufr->get_day();
 +    my $ho = $bufr->get_hour();
 +    my $mi = $bufr->get_minute();
 +    my $se = ($be == 4) ? $bufr->get_second() : 0;
 +
 +    my $include = 0;
 +  OR:
 +    foreach my $or_criterium (@$or_criteria_ref) {
 +        my $all_ok = 1;
 +        my @and_criteria = split /\s+/, $or_criterium;
 +      AND:
 +        foreach my $and_criterium (@and_criteria) {
 +            my ($c, $list) = split /=/, $and_criterium;
 +            my @list = split /,/, $list;
 +            if ($c eq 'be') {
 +                if (not grep { $_ eq $be } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'dc') {
 +                if (not grep { $_ eq $dc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ic') {
 +                if (not grep { $_ eq $ic } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'lc') {
 +                # Not in BUFR edition 3
 +                if (!(defined $lc) || not grep { $_ eq $lc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'oc') {
 +                if (not grep { $_ eq $oc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'os') {
 +                if (not grep { $_ eq $os } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mt') {
 +                if (not grep { $_ eq $mt } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'lt') {
 +                if (not grep { $_ eq $lt } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ye') {
 +                if (not grep { $_ eq $ye } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mo') {
 +                if (not grep { $_ eq $mo } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'da') {
 +                if (not grep { $_ eq $da } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ho') {
 +                if (not grep { $_ eq $ho } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mi') {
 +                if (not grep { $_ eq $mi } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'se') {
 +                if (not grep { $_ eq $se } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            }  else {
 +                die "Metadata '$c' not recognized, check `perldoc bufrextract.pl`"
 +                    . " for the full list of 2-letter abbreviations accepted!";
 +            }
 +      } # end AND
 +        if ($all_ok == 1) {
 +            # BUFR message has met all conditions in this
 +            # or-criterium, so no need to check the others
 +            $include = 1;
 +            last OR;
 +      }
 +
 +  } # end OR
 +
 +    return $include;
 +}
 +
  
  
Line 170: Line 334:
   bufrextract.pl <bufr file(s)>   bufrextract.pl <bufr file(s)>
       [--ahl <ahl_regexp>]       [--ahl <ahl_regexp>]
-      [--only_ahl[--without_ahl+      [--only_ahl | --without_ahl --gts
-      [--reuse_ahl n+      [--filter <metadata criteria>]
-      [--help]+
       [--outfile <filename>]       [--outfile <filename>]
 +      [--help]
       [--verbose n]       [--verbose n]
  
Line 179: Line 343:
  
 Extract all BUFR messages and/or corresponding AHLs from BUFR file(s), Extract all BUFR messages and/or corresponding AHLs from BUFR file(s),
-possibly filtering on AHL.+possibly filtering on AHL and/or metadata in section 1.
  
-The AHL (Abbreviated Header Line) is recognized as the TTAAii CCCC DTG +The AHL (Abbreviated Header Line) is recognized as the TTAAii CCCC 
-[BBB] immediately preceding the BUFR message.+YYGGgg [BBB] immediately preceding the BUFR message.
  
 Execute without arguments for Usage, with option C<--help> for some Execute without arguments for Usage, with option C<--help> for some
Line 193: Line 357:
    --ahl <ahl_regexp> Extract BUFR messages and/or AHLs with AHL    --ahl <ahl_regexp> Extract BUFR messages and/or AHLs with AHL
                       matching <ahl_regexp> only                       matching <ahl_regexp> only
 +   --gts              Include full gts message envelope if present
    --only_ahl         Extract AHLs only    --only_ahl         Extract AHLs only
    --without_ahl      Extract BUFR messages only    --without_ahl      Extract BUFR messages only
-   --reuse_ahl n  n=0 (default) AHL is considered belonging to a BUFR message +   --filter <metadata criteria> 
-                      only if immediately preceding +                      Extract BUFR messages matching the <metadata criteria> only
-                  n=1 When filtering using --ahl: Reuse last AHL found if current +
-                      BUFR message has no immediately preceding AHL +
-   --help             Display Usage and explain the options used. For even +
-                      more info you might prefer to consult perldoc bufrextract.pl+
    --outfile <filename>    --outfile <filename>
                       Will print to <filename> instead of STDOUT                       Will print to <filename> instead of STDOUT
 +   --help             Display Usage and explain the options used. For even
 +                      more info you might prefer to consult perldoc bufrextract.pl
    --verbose n        Set verbose level to n, 0<=n<=6 (default 0)    --verbose n        Set verbose level to n, 0<=n<=6 (default 0)
  
Line 208: Line 371:
  
 For option C<--ahl> the <ahl_regexp> should be a Perl regular For option C<--ahl> the <ahl_regexp> should be a Perl regular
-expression. E.g. C<--ahl 'ISS... ENMI'> will decode only BUFR SHIP+expression. E.g. C<--ahl 'ISS... ENMI'> will decode only SHIP BUFR
 (ISS) from CCCC=ENMI. (ISS) from CCCC=ENMI.
  
-If the BUFR file(sare known to consist solely of GTS bulletinsyou +Use option C<--gts> if you want the full GTS message envelope (if 
-probably should use C<--reuse 1> when applying C<--ahl>, in order to +present) to be included in output. There are 2 main variations on this 
-extract all (and not only the first) BUFR messages in multi message +envelope (SOH/ETX and ZCZC notation)for details see the Manual on 
-bulletins (such bulletins are very rarethough)Note that the +the GTS: Attachment II-4. Format of Meteorological Messages. 
-corresponding AHL is still extracted (and printedonly once.+ 
 +Using C<--filtermakes it possible to filter based on almost any of 
 +the metadata present in section 1 (and 0of the BUFR messages. Some few 
 +examples which hopefully are enough to illustrate how to write the 
 +<metadata criteria>: according to Common Code Table C-13 of 
 +WMO-no. 306"dc=0 ic=0,1,2,6" should take care of synoptic and 
 +one-hour observations from fixed-land stations, while "dc=1 ic=0,6" 
 +should do the same for marine stationsIf you want to extract both, 
 +use for <metadata criteria>: "dc=0 ic=0,1,2,6 | dc=1 ic=0,6"
 + 
 +Here is the full list of metadata available for filtering (the first 
 +2-letter abbreviation is what should be used in the <metadata criteria>): 
 + 
 +  be = BUFR edition 
 +  oc = Originating centre 
 +  os = Originating subcentre 
 +  dc = Data category (table A) 
 +  ic = International data subcategory 
 +  lc = Local data subcategory 
 +  mt = Master table version number 
 +  lt = Local table version number 
 +  ye = Year 
 +  mo = Month 
 +  da = Day 
 +  ho = Hour 
 +  mi = Minute 
 +  se = Second
  
-No bufrtables are needed for running bufrextract.pl, since section 4 +Note that no bufrtables are needed for running bufrextract.pl, since 
-in BUFR message will not be decoded (which also speeds up execution +section 4 in BUFR message will not be decoded (which also speeds up 
-quite a bit).+execution quite a bit).
  
 =head1 HINTS =head1 HINTS
Line 227: Line 416:
 section 0-3, by making your own copy of bufrextract.pl and then section 0-3, by making your own copy of bufrextract.pl and then
 employing one of the many C<get_> subroutines in BUFR.pm. For example, employing one of the many C<get_> subroutines in BUFR.pm. For example,
-to extract only BUFR messages with data category 1, add the following+to extract only BUFR messages with TM315009, add the following
 line just before calling C<is_filtered()> in code: line just before calling C<is_filtered()> in code:
  
-  next if $bufr->get_data_category() != 1;+  next if $bufr->get_descriptors_unexpanded() ne '315009';
  
 =head1 CAVEAT =head1 CAVEAT
  
-bufrextract.pl expects the character sequence CRCRLF (\r\r\n in perl) +Sometimes GTS bulletins are erroneously issued with extra characters 
-between the AHL and the start of the BUFR message. If the file of BUFR +between the GTS AHL and the start of BUFR message (besides the 
-messages has been prepared from a collection of GTS bulletins where +standard character sequence CRCRLF)likely leading bufrextract.pl to 
-this control character sequence has been removedyou should replace +miss the AHL.
-every 'BUFR' in file with '\r\r\nBUFR' before applying bufrextract.pl +
-in order to get the AHLs or filter on AHL.+
  
 =head1 AUTHOR =head1 AUTHOR
Line 247: Line 434:
 =head1 COPYRIGHT =head1 COPYRIGHT
  
-Copyright (C) 2010-2016 MET Norway+Copyright (C) 2010-2026 MET Norway
  
 =cut =cut
 </code> </code>
  • bufr.pm/bufrextract.pl_source.1480408867.txt.gz
  • Last modified: 2022-05-31 09:23:11
  • (external edit)