@@ -718,13 +718,16 @@ int llama_main(
         gpt_vocab vocab,
         llama_model model,
         int64_t t_load_us,
-        int64_t t_main_start_us) {
+        int64_t t_main_start_us,
+        FILE *instream,
+        FILE *outstream,
+        FILE *errstream) {
 
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(errstream, "%s: seed = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -751,13 +754,13 @@ int llama_main(
     // tokenize the reverse prompt
     std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(errstream, "\n");
+    fprintf(errstream, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(errstream, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(errstream, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
-    fprintf(stderr, "\n");
+    fprintf(errstream, "\n");
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -769,19 +772,19 @@ int llama_main(
         signal(SIGINT, sigint_handler);
 #endif
 
-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        fprintf(errstream, "%s: interactive mode on.\n", __func__);
 
         if (antiprompt_inp.size()) {
-            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(errstream, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(errstream, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
             for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(errstream, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
             }
-            fprintf(stderr, "\n");
+            fprintf(errstream, "\n");
         }
     }
-    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "\n\n");
+    fprintf(errstream, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(errstream, "\n\n");
 
     std::vector<gpt_vocab::id> embd;
 
@@ -795,7 +798,7 @@ int llama_main(
 
 
     if (params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
+        fprintf(errstream, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -814,7 +817,7 @@ int llama_main(
 
     // set the color for the prompt which will be output initially
     if (params.use_color) {
-        printf(ANSI_COLOR_YELLOW);
+        fprintf(outstream, ANSI_COLOR_YELLOW);
     }
 
     while (remaining_tokens > 0) {
@@ -823,7 +826,7 @@
             const int64_t t_start_us = ggml_time_us();
 
             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                fprintf(stderr, "Failed to predict\n");
+                fprintf(errstream, "Failed to predict\n");
                 return 1;
             }
 
@@ -877,16 +880,16 @@ int llama_main(
 
             // reset color to default if there is no pending user input
             if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) {
-                printf(ANSI_COLOR_RESET);
+                fprintf(outstream, ANSI_COLOR_RESET);
             }
         }
 
         // display text
         if (!input_noecho) {
             for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
+                fprintf(outstream, "%s", vocab.id_to_token[id].c_str());
             }
-            fflush(stdout);
+            fflush(outstream);
         }
 
         // in interactive mode, and not currently processing queued inputs;
@@ -901,16 +904,16 @@
                 // currently being interactive
                 bool another_line = true;
                 while (another_line) {
-                    fflush(stdout);
+                    fflush(outstream);
                     char buf[256] = {0};
                     int n_read;
-                    if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
+                    if (params.use_color) fprintf(outstream, ANSI_BOLD ANSI_COLOR_GREEN);
+                    if (fscanf(instream, "%255[^\n]%n%*c", buf, &n_read) <= 0) {
                         // presumably empty line, consume the newline
-                        std::ignore = scanf("%*c");
+                        std::ignore = fscanf(instream, "%*c");
                         n_read = 0;
                     }
-                    if (params.use_color) printf(ANSI_COLOR_RESET);
+                    if (params.use_color) fprintf(outstream, ANSI_COLOR_RESET);
 
                     if (n_read > 0 && buf[n_read-1] == '\\') {
                         another_line = true;
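The fscanf format string this hunk reroutes through instream is worth unpacking: "%255[^\n]" matches up to 255 non-newline characters into buf, "%n" records how many characters were consumed, and "%*c" discards the trailing newline without assigning it. On an empty line the scanset matches nothing, so fscanf returns 0 and the stray newline must be consumed separately, which is exactly what the <= 0 branch does. A minimal standalone sketch of the same idiom, for illustration only and not part of the patch:

    #include <cstdio>
    #include <tuple>

    int main() {
        char buf[256] = {0};
        int n_read;
        // Read one line of at most 255 chars; %n counts chars, %*c eats '\n'.
        if (fscanf(stdin, "%255[^\n]%n%*c", buf, &n_read) <= 0) {
            // Empty line: the scanset assigned nothing, consume the newline.
            std::ignore = fscanf(stdin, "%*c");
            n_read = 0;
        }
        printf("read %d chars: '%s'\n", n_read, buf);
        return 0;
    }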
@@ -936,7 +939,7 @@
 
         // end of text token
         if (embd.back() == 2) {
-            fprintf(stderr, " [end of text]\n");
+            fprintf(errstream, " [end of text]\n");
             break;
         }
     }
@@ -949,18 +952,18 @@
     {
         const int64_t t_main_end_us = ggml_time_us();
 
-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        fprintf(stderr, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(errstream, "\n\n");
+        fprintf(errstream, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(errstream, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(errstream, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(errstream, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(errstream, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
 
     if (params.use_color) {
-        printf(ANSI_COLOR_RESET);
+        fprintf(outstream, ANSI_COLOR_RESET);
     }
 
     return 0;
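With the three streams threaded through, the caller now decides where interaction happens instead of being pinned to the process's standard streams. A hypothetical usage sketch, assuming params, vocab, model, and the timing counters were set up beforehand as the existing code does; passing stdin/stdout/stderr reproduces the old behavior, and any other FILE* redirects it:

    // Old behavior: interact on the terminal.
    int ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                         stdin, stdout, stderr);

    // Alternative: write generated text to a file, keep diagnostics on stderr.
    FILE *out = fopen("generation.txt", "w");
    if (out != NULL) {
        ret = llama_main(params, vocab, model, t_load_us, t_main_start_us,
                         stdin, out, stderr);
        fclose(out);
    }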